Example No. 1
def process_keywords():
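    """Scrape the alphabetical keyword index and store every keyword.

    Strips a document.write() script snippet that breaks BeautifulSoup's
    parser, follows each per-letter link on the index page and hands
    every listed keyword to insert_keyword().
    """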
    mpage = http_cache.open_url(KEYWORD_URL_BASE + KEYWORD_LIST_URL, 'keyword')
    # BeautifulSoup's SGML parser will break at the following pattern,
    # so remove it before handing over for parsing
    pat = r'document.write\("<SCR"\+"IPT Language=.JavaScript. SRC=."\+"' + \
          r'http://"\+gDomain\+"/"\+gDcsId\+"/wtid.js"\+".></SCR"\+"IPT>"\);'

    massage = [(re.compile(pat), lambda match: '')]
    dir_soup = BeautifulSoup(mpage, markupMassage=massage,
                             fromEncoding='iso-8859-1',
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
    dir_list = dir_soup.find('p', text='A) Valitse asiasana aakkosittain'). \
                          parent.findNextSiblings('a')

    max_len = Keyword._meta.get_field_by_name('name')[0].max_length
    trim_re = re.compile(' \[$')
    for dir_elem in dir_list:
        kpage_url = KEYWORD_URL_BASE + dir_elem['href']
        kpage = http_cache.open_url(kpage_url, 'keyword')
        ksoup = BeautifulSoup(kpage, markupMassage=massage,
                              fromEncoding='iso-8859-1',
                              convertEntities=BeautifulSoup.HTML_ENTITIES)
        anchor = ksoup.find('p', text=' Suorita haku asiasanalla:')
        elem = anchor.parent.parent.nextSibling.nextSibling
        kword_list = elem.findAll('li')
        for kword in kword_list:
            insert_keyword(kword, max_len, trim_re)
Example No. 2
def process_minutes(full_update):
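    """Download and store plenary session minutes.

    Builds a "first last" -> Member lookup, then pages through the
    minutes listing, parsing each minutes document and its discussion
    links. The module-level from_pl/until_pl values limit the range;
    partially inserted minutes are removed if a discussion fails.
    """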
    start_from = from_pl
    stop_after = None
    member_list = Member.objects.all()
    member_dict = {}
    for mem in member_list:
        (last, first) = mem.name.split(' ', 1)
        name = ' '.join((first, last))
        if name in member_dict:
            raise Exception("Duplicate member name: %s" % name)
        member_dict[name] = mem

    next_link = url_base + MINUTES_URL
    while next_link:
        (info_list, next_link) = read_listing('minutes', next_link, new_only=not full_update)
        print 'Got links for a total of %d minutes' % len(info_list)
        for idx, info in enumerate(info_list):
            url = info['minutes_link']
            print '%4d. %s' % (idx, info['id'])
            if start_from:
                if info['id'] == start_from:
                    start_from = None
                else:
                    continue
            if stop_after and info['id'] != stop_after:
                return
            s = http_cache.open_url(url, 'minutes')
            tmp_url = 'http://www.eduskunta.fi/faktatmp/utatmp/akxtmp/'
            minutes = minutes_parser.parse_minutes(s, tmp_url)
            if not minutes:
                continue
            minutes['url'] = url
            try:
                mins = Minutes.objects.get(plenary_session__name=info['id'])
                if not full_update:
                    return
            except Minutes.DoesNotExist:
                mins = None
            pl_sess = insert_minutes(minutes, mins)
            try:
                for l in minutes['cnv_links']:
                    print l
                    s = http_cache.open_url(l, 'minutes')
                    disc = minutes_parser.parse_discussion(s, l)
                    insert_discussion(full_update, pl_sess, disc,
                                      minutes['cnv_links'].index(l),
                                      member_dict)
            except:
                Minutes.objects.get(plenary_session=pl_sess).delete()
                Statement.objects.filter(plenary_session=pl_sess).delete()
                raise
            transaction.commit()
            db.reset_queries()
            if until_pl and info['id'] == until_pl:
                stop_after = until_pl
Example No. 3
def read_listing(list_type, url, new_only=False):
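    """Collect parsed entries from a paginated listing.

    Follows the 'forward' link as long as full (50-entry) pages keep
    coming, or stops after one page when new_only is set. Returns
    (entries, next_url); next_url is None once the last page is reached.
    """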
    assert list_type in ('minutes', 'votes', 'docs')

    ret = []

    while True:
        s = http_cache.open_url(url, list_type, skip_cache=new_only)
        doc = html.fromstring(s)
        el_list = doc.xpath(".//div[@class='listing']/div/p")
        doc.make_links_absolute(url)

        for el in el_list:
            parsed_el = process_list_element(list_type, el)
            ret.append(parsed_el)

        # Check if last page of links
        if len(el_list) >= 50:
            fwd_link = doc.xpath(".//input[@name='forward']")
            url = url_base + fwd_link[0].attrib['value']
        else:
            url = None
            break
        if new_only:
            break

    return (ret, url)
Example No. 4
def get_wikipedia_links():
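    """Attach Wikipedia article links to current MPs.

    Scrapes the fi.wikipedia.org category of current MPs, matches each
    article title against Member names (falling back to difflib fuzzy
    matching), stores the link and also looks up the MP's home page.
    """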
    MP_LINK = 'http://fi.wikipedia.org/wiki/Luokka:Nykyiset_kansanedustajat'

    print "Populating Wikipedia links to MP's..."
    mp_list = Member.objects.all()
    mp_names = [mp.name for mp in mp_list]
    s = http_cache.open_url(MP_LINK, 'misc')
    doc = html.fromstring(s)
    links = doc.xpath(".//table//a[starts-with(@href, '/wiki')]")
    doc.make_links_absolute(MP_LINK)
    for l in links:
        href = l.attrib['href']
        if 'Toiminnot:Haku' in href:
            continue
        name = l.text
        if '(' in name:
            name = name.split('(')[0].strip()
        # Reorder "First Last" to "Last First" to match Member.name
        a = name.split()
        a = [a[-1]] + a[:-1]
        name = ' '.join(a)
        try:
            mp = Member.objects.get(name=name)
        except Member.DoesNotExist:
            matches = difflib.get_close_matches(name, mp_names, cutoff=0.8)
            if len(matches) > 1:
                raise Exception("Multiple matches for '%s'" % name)
            elif not matches:
                print "No match found for '%s'" % name
                continue
            print("Mapping '%s' to %s'" % (name, matches[0]))
            mp = Member.objects.get(name=matches[0])
        mp.wikipedia_link = href
        get_mp_homepage_link(mp)
        mp.save()
Example No. 5
def parse():
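    """Import questions and answers from the MTV3 2007 vaalikone CSV.

    Splits the header into 2D and 1D question columns, creates the
    Question rows via insert_question() and feeds every data row to
    handle_row().
    """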
    s = http_cache.open_url(CSV_URL, 'opinions')
    src, c = QuestionSource.objects.get_or_create(name='MTV3 vaalikone', year=2007,
                                                  url_name='mtv2007')
    reader = csv.reader(s.splitlines(), delimiter=',', quotechar='"')
    reader.next()
    hdr = reader.next()
    # 2d questions
    q_list = [idx for idx, col in enumerate(hdr) if col.startswith('[2d_x]')]
    i_list = [idx for idx, col in enumerate(hdr) if col.startswith('[2d_y]')]

    # 1d questions
    q2_list = [idx for idx, col in enumerate(hdr) if col.startswith('[1d_x]')]
    q_list.extend(q2_list)
    i_list.extend([-1] * len(q2_list))

    o_list = range(0, len(q_list))

    txt_list = [hdr[idx][7:].replace('_', ',') for idx in q_list]
    for i in o_list:
        if q_list[i] in range(56, 64):
            txt_list[i] = "Hallituspuolueena " + txt_list[i]

    q_info_list = zip(q_list, i_list, o_list, txt_list)
    for q in q_info_list:
        insert_question(src, q)
    for row in reader:
        handle_row(src, q_info_list, row)
Example No. 6
def process_mp(mp, url):
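    """Store one MP's election campaign budget.

    Parses the funding disclosure page, reads the total from the
    addSpaces() call in the '2. Vaalikampanjan rahoitus' row and saves
    it on the MP's MemberStats for the current term.
    """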
    tm = TermMember.objects.filter(member=mp, term=term)
    if not tm:
        return
    s = http_cache.open_url(url, 'funding')
    doc = html.fromstring(s)

    election_budget = None

    rows = doc.xpath(".//tr")
    for row in rows:
        th = row.getchildren()[0]
        if th.tag != 'th' or not th.text:
            continue
        if th.text.strip().startswith('2. Vaalikampanjan rahoitus'):
            scr = row[1][0]
            assert scr.tag == 'script'
            m = re.search(r"addSpaces\('([\d,.]*)'\)", scr.text)
            assert m
            s = m.groups()[0].replace(',', '.')
            if not s:
                continue
            election_budget = float(s)
    if not election_budget:
        return

    global mp_count
    mp_count += 1
    ms = MemberStats.objects.get(begin=term.begin, end=term.end, member=mp)
    ms.election_budget = election_budget
    ms.save()
    print "%30s: %.0f" % (mp.name, election_budget)
Example No. 7
def process_parties(db_insert):
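    """Fetch the party list, descriptions and logos.

    Parses the party listing and each party's info page, downloads the
    logo image unless it is already on disk and, when db_insert is set,
    creates or updates the corresponding Party rows.
    """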
    s = http_cache.open_url(party_url_base + party_list_url, 'party')
    parser = party_list_parser.Parser()
    parser.feed(s)
    parser.close()
    party_list = parser.get_list()
    parser = party_info_parser.Parser()
    for party in party_list:
        if party['name'] == 'vr':
            continue
        s = http_cache.open_url(party_url_base + party['info_link'], 'party')
        parser.reset()
        parser.feed(s)
        parser.close()
        party.update(parser.get_desc())

        logo_url = party_url_base + party['logo']
        fname = party['name'].encode('iso8859-1') + '.jpg'
        party['logo'] = fname
        fname = static_path + party_logo_path + fname
        create_path_for_file(fname)
        if not os.path.exists(fname):
            print 'Fetching logo ' + logo_url
            s = http_cache.open_url(logo_url, 'party')
            f = open(fname, 'wb')
            f.write(s)
            f.close()
        else:
            print 'Skipping logo ' + party['logo']

        if not db_insert:
            continue

        try:
            p = Party.objects.get(name=party['name'])
        except Party.DoesNotExist:
            p = None
        if not p:
            p = Party()
            p.name = party['name']
        if not p.full_name:
            p.full_name = party['fullname']
        p.logo = party_logo_path + party['logo']
        p.info_link = party_url_base + party['info_link']
        p.save()

    return party_list
Example No. 8
def get_mp_homepage_link(mp, force_update=False):
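    """Pick the MP's home page URL from their Wikipedia article.

    Looks for the 'Kotisivu' row, verifies that the linked page can
    actually be fetched and stores it in mp.homepage_link (the caller
    is expected to save the Member).
    """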
    if mp.homepage_link and not force_update:
        return
    s = http_cache.open_url(mp.wikipedia_link, 'misc')
    doc = html.fromstring(s)
    b = doc.xpath(".//b[.='Kotisivu']")
    if not b:
        return
    elem = b[0].getparent()
    href = elem.getnext().getchildren()[0].attrib['href']
    print "%s: %s" % (mp.name, href)
    # Try to fetch the homepage
    s = http_cache.open_url(href, 'misc', skip_cache=True, error_ok=True)
    if s:
        mp.homepage_link = href
    else:
        print "\tFailed to fetch"
Example No. 9
def parse():
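    """Import questions and answers from the HS 2011 vaalikone CSV.

    Parses the option order page, checks that the CSV header questions
    match the stored Question rows and feeds every data row to
    handle_row().
    """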
    s = http_cache.open_url(OPTIONS_URL, 'opinions')
    src, c = QuestionSource.objects.get_or_create(name='HS vaalikone', year=2011,
                                                  url_name='hs2011')
    parse_option_order(s, src)

    s = http_cache.open_url(CSV_URL, 'opinions')
    reader = csv.reader(s.splitlines(), delimiter=',', quotechar='"')
    hdr = reader.next()
    questions = [col.decode('utf8') for col in hdr[16::3]]
    for idx, q in enumerate(questions):
        if idx in SKIP_QUESTIONS:
            continue
        q_obj = Question.objects.get(source=src, text=q)
        assert q_obj.order == idx
    q_list = Question.objects.filter(source=src).order_by('order')
    writer = None
    for row in reader:
        row = [None if col == '-' else col.decode('utf8') for col in row]
        handle_row(src, row, writer)
Example No. 10
def download_processing_info(doc):
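    """Scrape the processing info page of a document.

    Returns a dict with the cleaned subject, the submission date in
    ISO (YYYY-MM-DD) form and the list of keywords.
    """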
    url = DOC_PROCESS_URL % (doc.type, doc.name)
    logger.info('updating processing info for %s' % doc)
    s = http_cache.open_url(url, 'docs')
    html_doc = html.fromstring(s)

    ret = {}

    subj_el = html_doc.xpath(".//div[@class='listing']/div[1]/div[1]/h3")
    assert len(subj_el) == 1
    ret['subject'] = clean_string(subj_el[0].text)

    for box_el in html_doc.xpath(".//div[@class='listborder']"):
        hdr_el = box_el.xpath("./div[@class='header']")
        if not hdr_el:
            continue
        assert len(hdr_el) == 1
        hdr = hdr_el[0].text_content().strip()
        if doc.type == 'VK':
            date_hdr_str = 'Kysymys j'
        elif doc.type == 'HE':
            date_hdr_str = 'Annettu eduskunnalle'
        elif doc.type == 'VNT':
            date_hdr_str = 'Ilmoitettu saapuneeksi'
        else:
            date_hdr_str = 'Aloite j'
        if hdr.startswith(date_hdr_str):
            date_el = box_el.xpath(".//div[.='Pvm']")
            assert len(date_el) == 1
            date = date_el[0].tail.strip()
            (d, m, y) = date.split('.')
            ret['date'] = '-'.join((y, m, d))
    assert 'date' in ret

    kw_list = []
    kw_el_list = html_doc.xpath(".//div[@id='vepsasia-asiasana']//div[@class='linkspace']/a")
    for kw in kw_el_list:
        kw = kw.text.strip()
        kw_list.append(kw)
    assert len(kw_list)
    ret['keywords'] = kw_list

    return ret
Example No. 11
def process_counties(db_insert):
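    """Import the county to electoral district mapping.

    Parses the tab-separated statistics listing and, when db_insert is
    set, creates or updates a County row for every line.
    """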
    s = http_cache.open_url(STAT_URL_BASE + STAT_COUNTY_URL, 'county')

    # strip first 4 lines of header and any blank/empty lines at EOF
    for line in s.rstrip().split('\n')[4:]:
        dec_line = line.decode('iso8859-1').rstrip().split('\t')
        (county_id, county_name, district_id, district_name) = dec_line

        if not db_insert:
            continue

        try:
            c = County.objects.get(name=county_name)
        except County.DoesNotExist:
            c = None
        if not c:
            c = County()
            c.name = county_name
        c.district = district_name
        c.save()
Example No. 12
def parse_district(district):
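    """List the candidates of one electoral district.

    Returns (last_name, first_name, href) tuples parsed from the
    candidate links, skipping party-level (.rt-2.) links.
    """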
    base = URL_BASE % district
    s = http_cache.open_url(base, 'opinions')
    doc = html.fromstring(s)
    doc.make_links_absolute(base)

    el_list = doc.xpath(".//td[@class='em-cell-name']/a")
    cand_list = []
    for el in el_list:
        href = el.attrib['href']
        # Party links have rt-2 in them
        if '.rt-2.' in href:
            continue
        assert '.rt-1.' in href
        m = re.match(r'(\d+) ([\w -]+), ([\w \-\"\.()]+)$', el.text, re.U)
        if not m:
            print "Skipping %s" % el.text.encode('utf8')
            continue
        last_name, first_name = m.groups()[1:]
        cand_list.append((last_name, first_name, href))
    return cand_list
Example No. 13
def process_district(district):
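    """Process the funding disclosures of one electoral district.

    Matches each table row to a Member by name and passes the best
    available disclosure link (Vaalirahoitusilmoitus preferred over
    Ennakkoilmoitus) to process_mp().
    """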
    url = URL_BASE % district
    s = http_cache.open_url(url, 'funding')
    doc = html.fromstring(s)
    doc.make_links_absolute(url)

    el_list = doc.xpath(".//div[@class='listing_table']")
    for el in el_list:
        rows = el.xpath(".//tr")
        for row in rows:
            ch = row.getchildren()[0]
            if ch.tag == 'th':
                continue
            m = re.match(r'([\w -]+)[ ]{2,}([\w -]+)', ch.text, re.U)
            if not m:
                print "Skipping %s" % ch.text
                continue
            fnames = m.groups()[0].strip()
            lname = m.groups()[1].strip()
            name = "%s %s" % (lname, fnames.split(' ')[0])
            name = parse_tools.fix_mp_name(name)

            mp = Member.objects.filter(name=name)
            if not mp:
                continue
            mp = mp[0]
            links = row.xpath('.//a')
            link = None
            for l in links:
                href = l.attrib['href']
                if l.text.strip() == "Ennakkoilmoitus":
                    if not link:
                        link = href
                elif l.text.strip() == "Vaalirahoitusilmoitus":
                    link = href
                else:
                    assert False
            assert link
            process_mp(mp, link)
Example No. 14
def parse_mp(src, lname, fname, href):
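    """Store one MP's answers from the candidate comparison page.

    Parses each question block, registers the question and its options
    via add_question(), then saves the MP's chosen option and optional
    comment as an Answer.
    """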
    name = "%s %s" % (lname, fname)
    name = parse_tools.fix_mp_name(name)
    mp = Member.objects.filter(name=name)
    if not mp:
        return
    mp = mp[0]

    if name in mp_dict:
        mp = mp_dict[name]
        mp.found = True

    print mp

    s = http_cache.open_url(href, 'opinions')
    doc = html.fromstring(s)
    q_list = doc.xpath(".//div[@class='em-compare-container']")
    for q_idx, q_el in enumerate(q_list):
        if q_idx in SKIP_QUESTIONS:
            continue

        el = q_el.xpath("./h3")
        assert len(el) == 1
        q_text = el[0].text.strip()
        m = re.match(r'\d+\.\s+(.+)', q_text, re.U)
        assert m
        q_text = m.groups()[0]

        a_list = q_el.xpath(".//td[@class='em-text']")
        a_text_list = []
        for a_el in a_list:
            a_text_list.append(a_el.text.strip())

        q_obj = add_question(src, q_text, q_idx, a_text_list)

        a_list = q_el.xpath(".//table[@class='em-compare-alts ']/tr")
        assert len(a_list) == len(a_text_list)
        chosen = None
        for a_idx, el in enumerate(a_list):
            if el.xpath(".//acronym"):
                assert not chosen
                chosen = a_idx
        if chosen is None:
            continue

        comm_el = q_el.xpath(".//div[@class='em-comment']")
        if comm_el:
            assert len(comm_el) == 1
            comm_el = comm_el[0]
            text_list = []
            for br in comm_el.xpath(".//br"):
                if not br.tail:
                    continue
                s = br.tail.strip()
                if s:
                    text_list.append(s)
            comm_text = '\n'.join(text_list)
            assert comm_text[0] == '"' and comm_text[-1] == '"'
            comm_text = comm_text[1:-1]
        else:
            comm_text = None

        opt = q_obj.opt_dict[chosen]
        try:
            ans = Answer.objects.get(member=mp, question=q_obj)
        except Answer.DoesNotExist:
            ans = Answer(member=mp, question=q_obj)
        ans.option = opt
        ans.explanation = comm_text
        ans.save()
Example No. 15
def process_mops(party_list, update=False, db_insert=False):
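    """Download MP listings, details and photos.

    Parses the MP list and each MP's info pages, fetches the portrait
    photo unless it is already on disk and, when db_insert is set,
    stores the Member along with party and district associations.
    """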
    s = http_cache.open_url(url_base + mp_list_url, 'member')
    BAD_HTML = '<! hx4600.thw>'
    idx = s.find(BAD_HTML)
    if idx >= 0:
        s = s[idx + len(BAD_HTML) + 1:]
    parser = mop_list_parser.Parser()
    parser.feed(s)
    parser.close()
    mop_list = parser.get_mop_list()

    parser = mop_info_parser.Parser()
    for idx, mp in enumerate(mop_list):
        print '%3d: %s, %s' % (idx, mp['surname'],
                               mp['firstnames'])
        s = http_cache.open_url(url_base + mp['link'], 'member')
        parser.reset(is_lame_frame=True)
        parser.feed(s)
        parser.close()
        mp.update(parser.get_desc())

        print '%3d: person number %s' % (idx, mp['hnro'])

        try:
            member = Member.objects.get(pk=mp['hnro'])
        except Member.DoesNotExist:
            member = None

        if member and not update:
            continue

        s = http_cache.open_url(url_base + heti_url % mp['hnro'],
                                'member')
        parser.reset(is_lame_frame=False)
        parser.feed(s)
        parser.close()
        mp.update(parser.get_desc())

        photo_url = url_base + mp['photo']

        ext = os.path.splitext(mp['photo'])[-1]

        fname = slugify(mp['name'])
        mp['photo'] = fname + ext
        photo_fname = static_path + mp_photo_path + mp['photo']
        create_path_for_file(photo_fname)
        if not os.path.exists(photo_fname):
            print 'Fetching photo ' + photo_url
            s = http_cache.open_url(photo_url, 'member')
            f = open(photo_fname, 'wb')
            f.write(s)
            f.close()
        else:
            print 'Skipping photo ' + mp['photo']

        party_name = None
        if 'party' in mp:
            party_name = find_party(party_list, mp['party'])
            if not party_name:
                raise Exception('Unknown party')

        for assoc in mp['assoc']:
            end = assoc.get('end')
            party = find_party(party_list, assoc['name'])
            if party is None:
                if not end:
                    print assoc
                    raise Exception('party not found')
                    # FIXME: Maybe add the party?
                assoc['name'] = None
            else:
                assoc['name'] = party

        # Find last party association
        last_assoc = sorted(mp['assoc'], key=operator.itemgetter('start'))[-1]
        if 'end' in last_assoc:
            if party_name:
                raise Exception('party set for inactive MP')
            party_name = last_assoc['name']

        if not db_insert:
            continue

        if not member:
            member = Member()
            member.id = mp['hnro']
        member.name = mp['name']
        member.party_id = party_name
        member.photo = mp_photo_path + mp['photo']
        member.info_link = url_base + heti_url % mp['hnro']
        member.birth_date = mp['birthdate']
        member.given_names = mp['firstnames']
        member.surname = mp['surname']
        if 'phone' in mp:
            member.phone = mp['phone']
        if 'email' in mp:
            member.email = mp['email']
        member.save()

        PartyAssociation.objects.filter(member=member).delete()
        for assoc in mp['assoc']:
            if not assoc['name']:
                continue
            end = assoc.get('end')
            if assoc['name'] == 'vr':
                assoc['name'] = 'vas'
            party = Party.objects.get(name=assoc['name'])
            pa = PartyAssociation()
            pa.member = member
            pa.party_id = party.pk
            pa.begin = assoc['start']
            pa.end = end
            pa.save()
        DistrictAssociation.objects.filter(member=member).delete()
        for assoc in mp['district']:
            end = assoc.get('end')
            da = DistrictAssociation()
            da.member = member
            da.name = assoc['name']
            da.begin = assoc['start']
            da.end = end
            da.save()

    return mop_list
Example No. 16
def download_he(info, doc):
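    """Download and summarize a government bill (HE).

    Fetches processing info, then the bill text (regenerating it via
    the frame page when needed), extracts the summary paragraphs and
    attaches keywords and related documents.
    """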
    assert doc
    p_info = download_processing_info(doc)
    doc.date = p_info['date']
    doc.subject = p_info['subject']
    doc.save()
    logger.info('%s: %s' % (doc, doc.subject))

    m = re.match(r'(\d+)/(\d{4})', info['id'])
    number, year = map(int, m.groups())
    url = HE_URL % (number, year)
    s = http_cache.open_url(url, 'docs', error_ok=True)
    if len(s) > 2*1024*1024:
        logger.warning('response too big (%d bytes)' % len(s))
        return doc
    if not s:
        (s, url) = http_cache.open_url(info['doc_link'], 'docs', return_url=True)
        if '<!-- akxereiloydy.thw -->' in s or '<!-- akx5000.thw -->' in s:
            print "\tNot found!"
            return doc
        html_doc = html.fromstring(s)
        frames = html_doc.xpath(".//frame")
        link_elem = None
        for f in frames:
            if f.attrib['src'].startswith('temp/'):
                link_elem = f
                break
        html_doc.make_links_absolute(url)
        url = link_elem.attrib['src']
        print "\tGenerated and found!"
        s = http_cache.open_url(url, 'docs')
    # First check whether it's actually a valid HE doc; the surest way to
    # detect a PDF-only one appears to be the length. *sigh*
    if len(s) < 1500:
        print "\tJust PDF"
        return doc
    html_doc = html.fromstring(s)
    elem_list = html_doc.xpath(".//p[@class='Normaali']")

    ELEM_CL = ['LLEsityksenPaaSis',
               'LLEsityksenp-00e4-00e4asiallinensis-00e4lt-00f6',
               'LLVSEsityksenp-00e4-00e4asiallinensis-00e4lt-00f6',
               'LLPaaotsikko']
    for cl in ELEM_CL:
        elem = html_doc.xpath(".//p[@class='%s']" % cl)
        if elem:
            break
    if not elem:
        print "\tNo header found: %d" % len(s)
        print http_cache.get_fname(url, 'docs')
        return doc
    # Choose the first header. Sometimes they are replicated. *sigh*
    elem = elem[0].getnext()
    p_list = []
    if 'class' in elem.attrib and elem.attrib['class'] == 'LLNormaali' and \
            elem.getnext().attrib['class'] == 'LLKappalejako':
        elem = elem.getnext()
    while elem is not None:
        if elem.tag != 'p':
            print elem.tag
            break
        OK_CLASS = ('LLKappalejako', 'LLJohtolauseKappaleet',
                    'LLVoimaantulokappale',
                    'LLKappalejako-0020Char-0020Char-0020Char', # WTF
        )
        if 'class' not in elem.attrib or elem.attrib['class'] not in OK_CLASS:
            break
        p_list.append(elem)
        elem = elem.getnext()
    BREAK_CLASS = ('LLNormaali', 'LLYleisperustelut', 'LLPerustelut',
                   'LLNormaali-0020Char', 'Normaali', 'LLSisallysluettelo')
    if elem is not None and 'class' in elem.attrib and \
            elem.attrib['class'] not in BREAK_CLASS:
        print "\tMystery class: %s" % elem.attrib
        print http_cache.get_fname(url, 'docs')
        return doc
    if not p_list:
        print "\tNo summary found"
        print http_cache.get_fname(url, 'docs')
        return doc

    text_list = []
    def append_text(elem, no_append=False):
        text = ''
        if elem.text:
            text = elem.text.replace('\r', '').replace('\n', '').strip()
            text = text.replace('&nbsp;', '')
        if elem.getchildren():
            for ch in elem.getchildren():
                text += append_text(ch, no_append=True)
        if len(text) < 15 and u'\u2014' in text:
            # Skip short em-dash-only fragments; return an empty string
            # for nested calls so the parent's concatenation still works.
            if no_append:
                return ''
            return
        if no_append:
            return text
        text = text.strip()
        if text:
            text_list.append(text)
    for p in p_list:
        append_text(p)
    doc.summary = '\n'.join(text_list)
    attach_keywords(doc, p_info['keywords'])
    if 'docs' in info:
        download_related_docs(doc, info['docs'])
    doc.save()

    return doc
Example No. 17
def download_doc(info, doc):
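    """Download one parliamentary document and store its metadata.

    Delegates HE documents to download_he() and stores only processing
    info for VNT documents; for the rest it fetches the structured
    (SGML) version, extracts subject, summary and signatures, and
    attaches keywords (except for committee reports) and related
    documents.
    """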
    logger.info("downloading %s %s" % (info['type'], info['id']))

    if not doc:
        assert not Document.objects.filter(type=info['type'], name=info['id'])
        doc = Document(type=info['type'], name=info['id'])

    url = DOC_DL_URL % (info['type'], info['id'])
    doc.info_link = url

    if not should_download_doc(info):
        logger.warning("skipping %s %s" % (info['type'], info['id']))
        doc.save()
        return doc
    if info['type'] == 'HE':
        return download_he(info, doc)
    if info['type'] == 'VNT':
        p_info = download_processing_info(doc)
        doc.date = p_info['date']
        doc.subject = p_info['subject']
        doc.save()
        attach_keywords(doc, p_info['keywords'])
        return doc

    s = http_cache.open_url(url, 'docs')
    html_doc = html.fromstring(s)
    html_doc.make_links_absolute(url)
    el_list = html_doc.xpath(".//a[contains(., 'Rakenteinen asiakirja')]")
    assert el_list and len(el_list) == 1

    sgml_url = el_list[0].attrib['href']

    s = http_cache.open_url(sgml_url, 'docs')
    f = open("/tmp/%s%s.xml" % (info['type'], info['id'].replace('/', '-')), "w")
    f.write(s)
    f.close()

    sgml_doc = html.fromstring(s)

    el_list = sgml_doc.xpath('.//ident/nimike')
    assert len(el_list) >= 1
    el = el_list[0]
    text = clean_string(el.text)
    logger.info('%s: %s' % (doc, text))
    doc.subject = text

    if doc.type.endswith('VM'):
        el_name_list = ('asianvir', 'emasianv')
    else:
        el_name_list = ('peruste', 'paasis', 'yleisper')
    for el_name in el_name_list:
        summ_el_list = sgml_doc.xpath('.//%s' % el_name)
        if not summ_el_list:
            continue
        assert len(summ_el_list) == 1
        break
    p_list = summ_el_list[0].xpath('//te')
    summary = []
    for p_el in p_list:
        text = clean_string(p_el.text_content())
        summary.append(text)
    doc.summary = '\n'.join(summary)
    doc.save()
    process_doc_signatures(doc, sgml_doc)

    # no processing info for committee reports
    if not doc.type.endswith('VM'):
        p_info = download_processing_info(doc)
        attach_keywords(doc, p_info['keywords'])
    if 'docs' in info:
        download_related_docs(doc, info['docs'])

    return doc
Example No. 18
def process_session_votes(url, pl_sess_name):
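    """Import one voting session of a plenary session.

    Parses the vote list page, creates or updates the PlenarySession
    and Session, links the session documents and their keywords and
    stores every individual Vote. Returns the Session.
    """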
    parser = vote_list_parser.Parser()
    s = http_cache.open_url(url, 'votes')
    parser.reset()
    parser.feed(s)
    parser.close()
    votes = parser.get_votes()
    desc = parser.get_desc()

    desc['nr'] = int(desc['nr'])
    desc['pl_session'] = pl_sess_name

    if pl_sess_name in pl_sess_list:
        pl_sess = pl_sess_list[pl_sess_name]
    else:
        try:
            pl_sess = PlenarySession.objects.get(name=pl_sess_name)
        except PlenarySession.DoesNotExist:
            pl_sess = PlenarySession(name=pl_sess_name)
            pl_sess_list[pl_sess_name] = pl_sess
        pl_sess.date = desc['date']
        pl_sess.term = Term.objects.get_for_date(pl_sess.date)
        pl_sess.info_link = url_base + desc['session_link']
        pl_sess.save()

    try:
        sess = Session.objects.get(plenary_session=pl_sess, number=desc['nr'])
    except Session.DoesNotExist:
        sess = Session(plenary_session=pl_sess, number=desc['nr'])
    sess.time = desc['time']
    sess.info = '\n'.join(desc['info'])
    sess.subject = desc['subject']
    sess.info_link = None
    sess.save()

    sess.docs.clear()
    sess.keywords.clear()
    for idx, doc_info in enumerate(desc['docs']):
        doc = Document.objects.filter(type=doc_info['type'], name=doc_info['id'])
        if not doc:
            doc = download_doc(doc_info, None)
        else:
            doc = doc[0]
        sd = SessionDocument(session=sess, doc=doc, order=idx)
        sd.save()
        for kw in doc.keywords.all():
            sess.keywords.add(kw)

    sess.vote_set.all().delete()
    for v in votes:
        vote = Vote()
        vote.session = sess
        vote.member_name = v[0]
        vote.party = v[1]
        vote.vote = v[2]
        if vote.member_name not in mem_name_list:
            member = Member.objects.get(name=vote.member_name)
            mem_name_list[vote.member_name] = member
        vote.member = mem_name_list[vote.member_name]
        vote.save()

    sess.count_votes()
    sess.save()

    db.reset_queries()

    return sess