Example #1
    def import_governingparties(self):
        """ This requires that governments & parties have already been imported """
        path = os.path.dirname(os.path.realpath(__file__))
        f = open(os.path.join(path, self.GPS_FILENAME))
        for line in f.readlines():
            line = line.strip().decode('utf8')
            if not line or line[0] == '#':
                continue
            (party, government, begin, end) = line.split('\t')
            try:
                party = Party.objects.get(abbreviation=party)
            except Party.DoesNotExist:
                raise ParseError(
                    'Invalid party %s in initial governing party data' % party)
            try:
                government = Government.objects.get(name=government)
            except Government.DoesNotExist:
                raise ParseError(
                    'Invalid government %s in initial governing party data' %
                    government)
            try:
                gp = GoverningParty.objects.get(party=party,
                                                government=government)
                if not self.replace:
                    continue
            except GoverningParty.DoesNotExist:
                gp = GoverningParty(party=party, government=government)
            gp.begin = begin
            if end == "None":
                gp.end = None
            else:
                gp.end = end
            self.logger.info("importing governing party %s / %s - %s" %
                             (gp.party, gp.begin, gp.end))
            gp.save()
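
For reference, a minimal sketch of the tab-separated data file this method parses (the rows below are hypothetical; per the code above, '#' starts a comment line and the literal string "None" marks an open-ended membership):

    # party	government	begin	end
    KOK	Katainen	2011-06-22	2014-06-24
    KESK	Sipilä	2015-05-29	None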
Example #2
def get_field_el(doc, field):
    # Get "doclist-items" elements listed by table headers (th)
    el_list = doc.xpath(
        '//div[@class="doclist-items"]/div[@class="listborder"]/table//th')
    for el in el_list:
        s = el.text.split(':')[0].strip()
        if s == FIELD_MAP[field]:
            # td follows th, so positional selection can be used
            td = el.getnext()
            if td.tag != 'td':
                raise ParseError('expecting a td element')
            return td
    return None
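
A hedged usage sketch for get_field_el, assuming FIELD_MAP maps logical field names to the Finnish table headers (the mapping and markup below are illustrative only):

    from lxml import html

    FIELD_MAP = {'phone': 'Puhelin'}  # assumed mapping for this sketch

    snippet = ('<div class="doclist-items"><div class="listborder">'
               '<table><tr><th>Puhelin:</th><td>09 432 3000</td></tr></table>'
               '</div></div>')
    doc = html.fromstring(snippet)
    td = get_field_el(doc, 'phone')
    if td is not None:
        print(td.text.strip())  # -> 09 432 3000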
Example #3
File: vote.py Project: kansanmuisti/kamu
    def _import_one(self, vote_id):
        (year, plsess, nr) = vote_id.split('/')
        url = self.URL_BASE + self.VOTE_URL % (int(year), plsess, int(nr))
        el_list, next_link = self.read_listing(self.CACHE_DIR, url)
        if len(el_list) != 1:
            raise ParseError("vote with id %s not found" % vote_id, url=url)
        el = el_list[0]
        vote_id_str = "%s/%s/%s" % (plsess, year, nr)
        got_id = "%s/%d" % (el['plsess'], el['number'])
        if vote_id_str != got_id:
            raise ParseError("invalid vote returned (wanted %s, got %s)" %
                             (vote_id_str, got_id),
                             url=url)
        info = {'plsess': el['plsess'], 'number': el['number']}
        info['link'] = el['results_link']
        try:
            plv = self.import_session(info)
        except ParseError as e:
            e.url = url
            raise
        db.reset_queries()
        return plv
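
A hedged usage note: the vote id appears to take the form "<year>/<session>/<vote number>", which the method rearranges to "<session>/<year>/<number>" for the sanity check against the listing:

    plv = importer._import_one('2014/85/3')  # hypothetical importer instance and id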
Example #4
    def import_sgml_doc(self, info, current_version):
        url = DOC_DL_URL % (info['type'], info['id'])
        if not current_version:
            current_version = '0'
        xml_fn = self.download_sgml_doc(info,
                                        url,
                                        current_version=current_version)
        if not xml_fn:
            return None
        f = open(xml_fn, 'r')
        root = html.fromstring(f.read())
        f.close()

        el_list = root.xpath('.//ident/nimike')
        assert len(el_list) >= 1
        el = el_list[0]
        text = self.clean_text(el.text)
        self.logger.info('%s %s: %s' % (info['type'], info['id'], text))
        info['subject'] = text

        if info['type'] == 'KK':
            self.parse_question_text(root, info)
        else:
            if info['type'].endswith('VM'):
                xpath_list = ('.//asianvir', './/emasianv')
            else:
                xpath_list = ('.//peruste', './/paasis', './/yleisper')
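            # Try each candidate xpath in order; the for-else below runs the
            # else branch only when no xpath matched (i.e. no break happened).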
            for xpath in xpath_list:
                l = root.xpath(xpath)
                if not l:
                    continue
                assert len(l) == 1
                target_el = l[0]
                break
            else:
                raise ParseError('Summary section not found')

            info['summary'] = self.parse_te_paragraphs(target_el)
        if info['type'] in ('KK', 'LA'):
            if info['type'] == 'KK':
                author_root = root.xpath(".//kysosa[@kieli='suomi']")
                assert len(author_root) == 1
                author_root = author_root[0]
            else:
                author_root = root

            info['author'] = self.parse_author(author_root)
            info['signatures'] = self.parse_signatures(author_root)
        return info
Example #5
    def import_members(self, **args):
        if not MemberActivityType.objects.count():
            import_activity_types()
            self.logger.info("%d activity types imported" %
                             MemberActivityType.objects.count())

        self.logger.debug("fetching MP list")
        if args.get('full', False):
            date_str = '24.03.1999'
        else:
            term = Term.objects.latest()
            date_str = term.begin.strftime('%d.%m.%Y')
        list_url = self.URL_BASE + self.LIST_URL % date_str
        s = self.open_url(list_url, 'member')
        doc = html.fromstring(s)
        doc.make_links_absolute(list_url)
        link_list = doc.xpath("//a[@target='edus_main']")
        for l in link_list:
            name = l.text.strip().replace('&nbsp', '')
            url = l.attrib['href']
            if 'single' in args and args['single'].lower() not in name.lower():
                continue
            self.logger.debug("fetching MP %s" % name)

            name = re.sub(r'\s*\([\w\d. ,]+\)\s*', '', name)
            last_name, given_names = name.split(',')
            given_names = given_names.strip()
            last_name = last_name.strip()
            try:
                Member.objects.get(surname=last_name, given_names=given_names)
                if not self.replace:
                    continue
            except Member.DoesNotExist:
                pass

            s = self.open_url(url, 'member')
            doc = html.fromstring(s)
            el = doc.xpath("//frame[@name='vasen2']")
            if len(el) != 1:
                raise ParseError("Invalid MP info frame")
            s = el[0].attrib['src']
            m = re.search(r'hnro=(\d+)', s)
            if not m:
                raise ParseError("MP ID not found")
            mp_id = int(m.groups()[0])
            # FIXME: New MPs that replace the euro-MEPs -- Remove this later
            if mp_id in (1275, 1276, 1277):
                if datetime.now() < datetime(year=2014, month=7, day=4):
                    continue
            mp_info = self.fetch_member(mp_id)
            if not mp_info:
                continue
            if 'dry_run' in args and not args['dry_run']:
                self.save_member(mp_info)
            elif 'dry_run' in args and args['dry_run']:
                pprint.pprint(mp_info)

        self.logger.info('Imported {0} MPs'.format(len(link_list)))

        self.logger.info(
            "Adding Carl Haglund as a pseudo-MP for purposes of minister counting"
        )

        haglund_info = {
            'birthdate': '1979-03-29',
            'birthplace': 'Espoo',
            'email': '*****@*****.**',
            'given_names': 'Carl Christoffer',
            'home_county': 'Espoo',
            'id': 'nonmp_0001',
            'info_url': 'http://valtioneuvosto.fi/hallitus/jasenet/puolustusministeri/fi.jsp',
            'name': 'Carl Haglund',
            # The following two are a minimal hack to make Carl show up only in
            # the minister calculations. See save_member.
            'party': 'r',
            'parties': {},
            'phone': '09 1608 8284',
            'portrait': 'http://valtioneuvosto.fi/documents/10184/143444/Carl+Haglund/694456f8-8ce7-453a-bf8d-161c4a4d01ca?t=1404465254000&width=500',
            'districts': {},
            'posts': [{
                'begin': datetime(year=2012, month=7, day=5).date(),
                'end': None,
                'label': 'puolustusministeri',
                'role': 'minister'
            }],
            'surname': 'Haglund'
        }
        self.save_member(haglund_info)
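
A hedged call-site sketch for import_members; the flag names ('full', 'single', 'dry_run') come from the code above, while the importer instance is hypothetical:

    importer.import_members(single='Haglund', dry_run=True)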
Example #6
    def fetch_member(self, mp_id):
        url = self.URL_BASE + self.MP_INFO_URL % mp_id
        s = self.open_url(url, 'member')
        doc = html.fromstring(s)
        doc.make_links_absolute(url)

        mp_info = {'id': mp_id, 'info_url': url}

        name_el = doc.xpath('//div[@id="content"]/div[@class="header"]/h1')
        if len(name_el) != 1:
            raise ParseError("MP name not found")
        name, pg = name_el[0].text.strip().split('/')
        name = name.strip()
        pg = pg.strip()
        names = name.split()
        surname, first_names = names[-1], ' '.join(names[0:-1])
        mp_info['name'] = "%s %s" % (surname, first_names)

        name_el = get_field_el(doc, 'name')
        if name_el is None:
            self.logger.warning("MP info element not found")
            return None

        surname, given_names = name_el.text.strip().split(', ')
        if '(' in given_names:
            given_names = given_names.split('(')[0].strip()
        if not given_names and surname == 'Saarikangas':
            given_names = 'Martin'
        mp_info['surname'] = surname
        mp_info['given_names'] = given_names.strip()

        td = get_field_el(doc, 'phone')
        if td is not None:
            mp_info['phone'] = td.text.strip()

        td = get_field_el(doc, 'email')
        if td is not None:
            mp_info['email'] = td.text_content().strip().replace('[at]', '@')

        td = get_field_el(doc, 'birth')
        text = td.text.strip()
        # First try to match the birthplace, too
        m = re.match(self.DATE_MATCH + r'\s+(\w+)', text, re.U)
        if not m:
            m = re.match(self.DATE_MATCH, text, re.U)
            if not m:
                raise ParseError("Invalid MP birth date")
        (day, mon, year) = m.groups()[0:3]
        mp_info['birthdate'] = '-'.join((year, mon, day))
        if len(m.groups()) == 4:
            mp_info['birthplace'] = m.groups()[3]

        # Electorate

        td = get_field_el(doc, 'home_county')
        if td is not None:
            mp_info['home_county'] = td.text.strip()

        td = get_field_el(doc, 'districts')
        el_list = td.xpath('ul/li')
        da_list = []
        for el in el_list:
            district, date_range = el.text.strip().split('  ')
            dates = date_range.split(' - ')
            da = {'district': district, 'begin': self.convert_date(dates[0])}
            if len(dates) > 1:
                da['end'] = self.convert_date(dates[1])
            da_list.append(da)
        mp_info['districts'] = da_list

        # Party memberships

        td = get_field_el(doc, 'parties')
        el_list = td.xpath('ul/li')
        pa_list = []
        for el in el_list:
            a_el = el.xpath('a')
            if not a_el:
                # Strip text within parentheses
                m = re.match(r'([^\(]*)\([^\),]+\)(.+)', el.text)
                if m:
                    text = ' '.join(m.groups())
                else:
                    text = el.text
                m = re.match(r'(\D+)\s+([\d\.,\s\-]+)$', text.strip())
                party, date_ranges = (m.groups()[0], m.groups()[1])
            else:
                a_el = a_el[0]
                party, date_ranges = (a_el.text.strip(), a_el.tail.strip())

            # Strip text within parentheses
            m = re.match(r'([^\(]*)\([^\)]+\)(.+)', date_ranges)
            if m:
                date_ranges = ' '.join(m.groups())

            for dr in date_ranges.split(','):
                pa = {'party': party}
                dates = dr.strip().split(' - ')
                pa['begin'] = self.convert_date(dates[0])
                if len(dates) > 1:
                    pa['end'] = self.convert_date(dates[1])
                pa_list.append(pa)
        mp_info['parties'] = pa_list

        img_el = doc.xpath('//div[@id="submenu"]//img[@class="portrait"]')
        mp_info['portrait'] = img_el[0].attrib['src']

        # Committee memberships
        mp_info['posts'] = self.resolve_memberships(doc)

        mp_info['gender'] = figure_mp_gender(name)

        return mp_info
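
For orientation, a hedged sketch of the dict fetch_member assembles on success; the keys come from the code above, all values here are placeholders:

    mp_info = {
        'id': 414,                      # hypothetical MP id
        'info_url': 'http://...',       # URL_BASE + MP_INFO_URL % mp_id
        'name': 'Surname Given Names',
        'surname': 'Surname',
        'given_names': 'Given Names',
        'phone': '...',
        'email': '...',
        'birthdate': '1970-01-31',      # 'birthplace' is set only when matched
        'home_county': '...',
        'districts': [{'district': '...', 'begin': '...', 'end': '...'}],
        'parties': [{'party': '...', 'begin': '...', 'end': '...'}],
        'portrait': 'http://...',
        'posts': [],                    # from resolve_memberships
        'gender': '...',                # from figure_mp_gender
    }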
Example #7
    def handle_processing_stages(self, info, html_doc):
        doc_name = "%s %s" % (info['type'], info['id'])

        status_map = {1: 'upcoming', 2: 'in_progress', 3: 'finished'}
        names = {
            'vireil': ('intro', ('Annettu eduskunnalle', 'Aloite jätetty')),
            'lahete': ('debate', 'Lähetekeskustelu'),
            'valiok': ('committee', 'Valiokuntakäsittely'),
            'poydal': ('agenda', 'Valiokunnan mietinnön pöydällepano'),
            '1kasit': ('1stread', 'Ensimmäinen käsittely'),
            '2kasit': ('2ndread', 'Toinen käsittely'),
            '3kasit': ('3ndread', 'Kolmas käsittely'),
            'paat': ('finished', None),
            'akasit': ('onlyread', 'Ainoa käsittely'),
            'akja2k': ('only2read', 'Ainoa ja toinen käsittely'),
            '3kjaak': ('only3read', 'Kolmas ja ainoa käsittely'),
            'peru': ('cancelled', 'Ilmoitus peruuttamisesta'),
            'rauennut': ('lapsed', None),
            'raue': ('lapsed', None),
            'jatlep': ('suspended', None),
        }
        finishing_phases = ('3ndread', '2ndread', 'onlyread', 'only2read',
                            'only3read', 'cancelled', 'lapsed')

        img_els = html_doc.xpath("//div[@id='vepsasia-kasittely']/img")
        assert len(img_els)
        phases = []
        for img in img_els:
            s = img.attrib['src']
            m = re.match(r'/thwfakta/yht/kuvat/palkki/ve([a-z0-9]+)_(\d)\.gif',
                         s)
            phase = m.groups()[0]
            status = int(m.groups()[1])
            status = status_map[status]
            if phase not in names:
                raise ParseError("unknown processing phase %s" % phase)
            l = names[phase]
            phase = l[0]
            phases.append((phase, status, l[1]))

        last_phase = phases[-1][0]
        phase_list = []
        # Make sure the check after the loop cannot hit an unbound name even if
        # every phase is skipped inside the loop.
        finishing_phase = None
        for idx, (phase, status, el_name) in enumerate(phases):
            if not el_name or status not in ('in_progress', 'finished'):
                continue
            box_el_list = html_doc.xpath("//div[@class='listborder']//h2")
            # quirks
            if doc_name in ('HE 25/2009', 'HE 57/2014',
                            'HE 29/2014') and phase == '2ndread':
                el_name = names['akja2k'][1]
            if doc_name in ('HE 29/2014', 'HE 3/2014', 'HE 215/2013',
                            'HE 203/2013', 'HE 288/2014',
                            'HE 297/2014') and phase == 'only2read':
                phase = '2ndread'
                el_name = names['2kasit'][1]
            if doc_name == 'HE 112/2011':
                if phase == '2ndread':
                    continue
                if phase == '1stread':
                    phase = 'onlyread'
                    el_name = names['akasit'][1]

            finishing_phase = None
            for box_el in box_el_list:
                s = box_el.text_content().strip().strip('.')
                if isinstance(el_name, tuple):
                    if s not in el_name:
                        continue
                else:
                    if el_name != s:
                        continue
                parent_el = box_el.getparent().getparent()
                break
            else:
                if phase == 'committee' and last_phase in ('cancelled',
                                                           'lapsed'):
                    continue
                self.logger.warning("processing stage '%s' not found" %
                                    el_name)
                continue

            phase_info = {}

            if phase == 'committee':
                el_list = parent_el.xpath(".//div[contains(., 'Valmistunut')]")
                date_list = []
                for date_el in el_list:
                    date = date_el.tail.strip()
                    (d, m, y) = date.split('.')
                    date = '-'.join((y, m, d))
                    date_list.append(date)
                if not date_list and last_phase in ('cancelled', 'lapsed'):
                    continue
                if not date_list:
                    self.logger.warning("date not found for committee phase")
                    continue
                date = max(date_list)
            else:
                date_el = parent_el.xpath(".//div[.='Pvm']")
                assert len(date_el) == 1
                arr = date_el[0].getparent().text_content().strip().split()
                assert len(arr) >= 2
                (d, m, y) = arr[-1].split('.')
                date = '-'.join((y, m, d))

                min_el = parent_el.xpath(".//div[.='Istuntopöytäkirja']")
                if min_el and phase != 'cancelled':
                    links = min_el[0].getparent().xpath('a')
                    assert len(links) >= 1
                    plsess_list = []
                    for l in links:
                        href = l.attrib['href']
                        m = re.search(r'\{KEY\}=PTK\+(\d+/\d{4})', href)
                        assert m, 'Plenary session id not found (phase %s)' % phase
                        plsess_id = m.groups()[0]
                        m = re.search(r'\{KNRO\}=(\d+)', href)
                        assert m, 'Plenary session item number not found (phase %s)' % phase
                        plitem_nr = m.groups()[0]
                        plsess_list.append({
                            'plsess': plsess_id,
                            'index': plitem_nr
                        })
                    phase_info['plsess_items'] = plsess_list

            if phase == 'finished':
                finishing_phase = idx

            phase_info.update({'index': idx, 'phase': phase, 'date': date})
            phase_list.append(phase_info)
            #print "%s: %s" % (phase, date)

        if not finishing_phase:
            for p in phases:
                if p[0] == 'finished' and p[1] != 'upcoming':
                    is_finished = True
                    break
            else:
                is_finished = False
            if is_finished:
                for p in phase_list:
                    if p['phase'] in finishing_phases:
                        finishing_phase = p
                        break
                assert finishing_phase, 'Finishing phase not found'

                idx = finishing_phase['index'] + 1
                max_idx = max([x['index'] for x in phase_list])
                assert max_idx < idx
                phase_list.append({
                    'index': idx,
                    'phase': 'finished',
                    'date': finishing_phase['date']
                })

        info['phases'] = phase_list
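
For orientation, a hedged sketch of one entry in the info['phases'] list built above; the keys come from the code, values are placeholders:

    phase_info = {
        'index': 3,
        'phase': '1stread',
        'date': '2014-03-12',
        # present only when plenary minutes were linked:
        'plsess_items': [{'plsess': '25/2014', 'index': '7'}],
    }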
Example #8
    def import_doc(self, info):
        url = DOC_DL_URL % (info['type'], info['id'])
        info['info_link'] = url
        self.fix_id_quirks(info)
        if not should_download_doc(info):
            self.logger.warning("skipping %s %s" % (info['type'], info['id']))
            return None

        origin_id = "%s %s" % (info['type'], info['id'])
        try:
            doc = Document.objects.get(origin_id=origin_id)
        except Document.DoesNotExist:
            doc = Document(origin_id=origin_id)

        if 'update_time' in info:
            doc.mark_checked()
            if (doc.last_modified_time
                    and doc.last_modified_time >= info['update_time']
                    and not self.replace):
                self.logger.debug("%s %s not updated" %
                                  (info['type'], info['id']))
                doc.save(update_fields=['last_checked_time'])
                return None
            else:
                self.logger.debug(
                    "%s %s updated %s (checked %s)" %
                    (info['type'], info['id'], info['update_time'],
                     doc.last_modified_time))
        else:
            if doc.pk and not self.replace:
                return doc

        doc.type = DOC_TYPES[info['type']]
        doc.name = origin_id

        info = self.fetch_processing_info(info)

        if info['type'] == 'HE':
            self.import_he(info)
        else:
            ret = self.import_sgml_doc(info, current_version=doc.version)
            if not ret:
                return None

        doc.version = info.get('doc_version', None)
        doc.subject = info['subject']
        for attr_name in ('summary', 'question', 'answer', 'answerer_name',
                          'answerer_title'):
            if attr_name in info:
                setattr(doc, attr_name, info[attr_name])
        if 'error' in info:
            doc.error = info['error']
        else:
            doc.error = None
        # Figure out the document date through the intro stage.
        for st in info['phases']:
            if st['phase'] == 'intro':
                doc.date = st['date']
                break
        if doc.date is None:
            raise ParseError("Document date could not be determined")
        doc.info_link = info['info_link']
        if 'sgml_link' in info:
            doc.sgml_link = info['sgml_link']
        if 'author' in info:
            doc.author = Member.objects.get(origin_id=info['author']['id'])

        doc.mark_modified()
        doc.save()

        self.save_stages(doc, info)
        self.save_keywords(doc, info)
        if 'signatures' in info:
            self.save_signatures(doc, info)

        # The keywords are saved only at this point. Save the document again so
        # that the proper KeywordActivity objects are created.
        doc._updated = True
        doc.save()

        return doc
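
A hedged call-site sketch for import_doc; the importer instance and the document id are hypothetical, and 'update_time' is optional, enabling the modification check above:

    info = {'type': 'HE', 'id': '100/2014'}
    doc = importer.import_doc(info)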
Example #9
File: vote.py Project: kansanmuisti/kamu
    def import_session(self, info):
        if info['plsess'] not in self.plsess_by_id:
            try:
                plsess = PlenarySession.objects.get(origin_id=info['plsess'])
            except PlenarySession.DoesNotExist:
                raise Exception(
                    "Vote %s refers to nonexistent plenary session %s" %
                    (info['number'], info['plsess']))
            self.plsess_by_id[info['plsess']] = plsess

        plsess = self.plsess_by_id[info['plsess']]
        try:
            pv = PlenaryVote.objects.get(plsess=plsess, number=info['number'])
            if not self.replace:
                return
        except PlenaryVote.DoesNotExist:
            pv = PlenaryVote(plsess=plsess, number=info['number'])

        self.logger.info('processing plenary vote %s/%d' %
                         (plsess.name, info['number']))
        s = self.open_url(info['link'], self.CACHE_DIR)

        doc = html.fromstring(s)

        hdr_el = doc.xpath('//table[@class="voteResults"]')
        if len(hdr_el) < 1:
            raise ParseError('vote header not found')
        hdr_el = hdr_el[0]
        s = self.clean_text(hdr_el.xpath('caption')[0].text)
        m = re.match(r'Äänestys (\d+) klo (\d{2}\.\d{2})', s, re.U)
        info['time'] = m.groups()[1]

        el = hdr_el.xpath('tbody/tr')[0].xpath('td')[1]
        s = self.clean_text(el.text)
        info['subject'] = s

        el = hdr_el.xpath('tbody/tr/td/strong')[0]
        s = self.clean_text(el.text)
        step = PROCESSING_STEP[s]

        el = doc.xpath("//th[contains(., 'nestysasettelu')]")[0]
        s = self.clean_text(el.getnext().text)
        info['setting'] = s

        vote_list_el = doc.xpath('//table[@class="statistics"]/tbody/tr')
        if len(vote_list_el) < 196 / 2 or len(vote_list_el) > 200 / 2:
            raise ParseError('vote list not found')
        votes = []
        for row_el in vote_list_el:
            td_list = row_el.xpath('td')
            if len(td_list) != 5:
                raise ParseError('invalid vote row')
            votes.append(parse_vote(td_list[0].text, td_list[1].text))
            if td_list[3].text:
                votes.append(parse_vote(td_list[3].text, td_list[4].text))
        info['votes'] = votes

        pv.mark_modified()
        pv.mark_checked()

        self.updated += 1

        return self.save_session(pv, info)
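
A hedged illustration of the vote-header regex used above and the groups it captures:

    import re
    s = 'Äänestys 1 klo 13.05'  # hypothetical cleaned caption text
    m = re.match(r'Äänestys (\d+) klo (\d{2}\.\d{2})', s, re.U)
    print(m.groups())  # -> ('1', '13.05')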