Example #1
def _save_a_written_question(link):
    soupsoup, suppesuppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + link.a["href"], "written question %s" % re.search("dossierID=([0-9A-Z-]+).xml", link.a["href"]).groups()[0])
    data = AccessControlDict(((x.td.text, x('td')[1]) for x in soupsoup.find('table', 'txt')('tr') if x.td.text))
    data_nl = AccessControlDict(((x.td.text, x('td')[1]) for x in suppesuppe.find('table', 'txt')('tr') if x.td.text))
    get_or_create(WrittenQuestion,
                  _id="lachambre_id",
                  lachambre_id=re.search("dossierID=([0-9A-Z-]+).xml", link.a["href"]).groups()[0],
                  title={"fr": data["Titre"].text, "nl": data_nl["Titel"].text},
                  departement={"fr": data[u"Département"].text, "nl": data_nl[u"Departement"].text},
                  sub_departement={"fr": data[u"Sous-département"].text, "nl": data_nl[u"Sub-departement"].text},
                  deposition_date=data[u"Date de dépôt"].text,
                  delay_date=dico_get_text(data, u"Date de délai"),
                  publication_date=dico_get_text(data, "Date publication"),
                  # TODO: link to the actual deputy
                  author=data[u"Auteur"].text,
                  language=data[u"Langue"].text,
                  question_status={"fr": dico_get_text(data, "Statut question"), "nl": dico_get_text(data_nl, "Status vraag")},
                  status={"fr": dico_get_text(data, "Statut"), "nl": dico_get_text(data_nl, "Status")},
                  question={"fr": u"%s" % data["Question"], "nl": "%s" % data_nl["Vraag"]},
                  answer={"fr": dico_get_text(data, u"Réponse"), "nl": dico_get_text(data_nl, u"Antwoord")},
                  publication_reponse_pdf_url=get_href_else_blank(data, u"Publication réponse"),
                  publication_question_pdf_url=get_href_else_blank(data, u"Publication question"),
                  publication_reponse=get_text_else_blank(data, u"Publication réponse"),
                  publication_question=get_text_else_blank(data, u"Publication question"),
                  eurovoc_descriptors={"fr": get_items_list_else_empty_list(data, "Descripteurs Eurovoc"),
                                       "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-descriptoren")},
                  eurovoc_candidats_descriptors={"fr": get_items_list_else_empty_list(data, "Candidats-descripteurs Eurovoc"),
                                                 "nl": get_items_list_else_empty_list(data_nl, "Eurovoc kandidaat-descriptoren")},
                  keywords={"fr": get_items_list_else_empty_list(data, u"Mots-clés libres"),
                            "nl": get_items_list_else_empty_list(data_nl, u"Vrije trefwoorden")},
                  url=link.a["href"],
                 )

    data.die_if_got_not_accessed_keys()
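
The snippet above leans on several small helpers that are defined elsewhere in the project: AccessControlDict, dico_get_text, get_text_else_blank, get_href_else_blank and get_items_list_else_empty_list. Their real implementations are not reproduced in these examples; the sketch below is only a guess at their behaviour, inferred from the way they are called above (get_text_else_blank would follow the same pattern as dico_get_text).

# Sketch only: names and behaviour are inferred from the calls above,
# not copied from the project.
class AccessControlDict(dict):
    """Dict that remembers which keys were read, so the scraper can notice
    rows of the page that no handler ever looked at."""
    def __init__(self, *args, **kwargs):
        super(AccessControlDict, self).__init__(*args, **kwargs)
        self.accessed = set()

    def __getitem__(self, key):
        self.accessed.add(key)
        return super(AccessControlDict, self).__getitem__(key)

    def die_if_got_not_accessed_keys(self):
        ignored = set(self.keys()) - self.accessed
        if ignored:
            raise ValueError("unhandled rows on the page: %s" % ", ".join(ignored))


def dico_get_text(data, key):
    # Text of the cell when the row exists, an empty string otherwise.
    try:
        return data[key].text
    except KeyError:
        return ""


def get_href_else_blank(data, key):
    # Href of the link inside the cell, or "" when the row or the link is missing.
    try:
        return data[key].a["href"] if data[key].a else ""
    except KeyError:
        return ""


def get_items_list_else_empty_list(data, key):
    # The cell holds a comma-separated list; normalise it to a Python list.
    try:
        return [x.strip() for x in data[key].text.split(",") if x.strip()]
    except KeyError:
        return []
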
Example #2
def scrape():
    soup, suppe = read_or_dl_with_nl(
        "http://www.lachambre.be/kvvcr/showpage.cfm?section=/comm/commissions&language=fr&cfm=/site/wwwcfm/comm/LstCom.cfm&rightmenu=right_cricra",
        "commissions list")
    _type = ""
    for i, j in zip(soup("div", id="story")[1], suppe("div", id="story")[1]):
        if not isinstance(i, NavigableString) and (i.h4 or i.a):
            if i.h4:
                _type = i.h4.text
                _type_nl = j.h4.text
            elif i.a:
                commission = get_or_create(Commission,
                                           lachambre_id=int(
                                               re.search(
                                                   r"com=(\d+)",
                                                   i.a["href"]).groups()[0]))
                commission.type["fr"] = _type
                commission.type["nl"] = _type_nl
                commission.name["fr"] = i.a.text
                commission.name["nl"] = j.a.text
                commission.url = i.a["href"]

                commission.save()

    for com in list(Commission.objects.all()):
        handle_commission(com)
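
get_or_create, used throughout these examples, is another project helper that is not reproduced here. Judging from its call sites (sometimes a plain set of lookup fields, sometimes an explicit _id argument naming the unique field, as in the written-question example), it plausibly behaves like the sketch below, which assumes a Django/MongoEngine-style objects manager on the model classes.

def get_or_create(klass, _id=None, **kwargs):
    # Sketch, not the project's implementation. Look the object up either by
    # the single field named in _id or by all the keyword arguments, and
    # create and save it with those values if it does not exist yet.
    lookup = {_id: kwargs[_id]} if _id else kwargs
    existing = klass.objects.filter(**lookup)
    if existing:
        return existing[0]
    instance = klass(**kwargs)
    instance.save()
    return instance

Either way the caller gets back a saved instance it can keep mutating, which is why scrape() above can fill in commission.type, commission.name and commission.url afterwards and call commission.save() again.
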
Example #3
def handle_commission(commission):
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + commission.url,
                                     "commission %s" % commission.lachambre_id)
    commission.full_name["fr"] = soup.h1.text
    commission.full_name["nl"] = suppe.h1.text
    commission.deputies = []
    seats = {"fr": {}, "nl": {}}
    for i, j in zip(soup('p'), suppe('p')):
        role = i.b.text[:-1]
        role_nl = j.b.text[:-1]
        for dep in i('a'):
            deputy = Deputy.objects.get(lachambre_id=re.search(
                "key=([O0-9]+)", dep["href"]).groups()[0])
            membership = get_or_create(CommissionMembership,
                                       deputy=deputy,
                                       commission=commission)
            membership.role = role
            membership.save()
            commission.deputies.append(membership.id)
        seats["fr"][role] = map(
            lambda x: (x[0], len(x[1].split(','))),
            zip(map(lambda x: x.text[:-1],
                    i('b')[1:]),
                str(i).split("<br />")[1:]))
        seats["nl"][role_nl] = map(
            lambda x: (x[0], len(x[1].split(','))),
            zip(map(lambda x: x.text[:-1],
                    i('b')[1:]),
                str(i).split("<br />")[1:]))

    commission.seats = seats
    commission.save()
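
The seats computation is the densest part of handle_commission: every <b> tag after the role header names a political group, and the comma-separated deputies that follow it are counted to get that group's number of seats. Here is a small, self-contained illustration of the idea on invented markup (illustration only; the real lachambre.be pages are more ornate, and the project used BeautifulSoup 3, whose str() output writes "<br />" where bs4 writes "<br/>").

# Invented sample markup, for illustration only.
from bs4 import BeautifulSoup

p = BeautifulSoup(
    "<p><b>Membres effectifs:</b><br />"
    "<b>PS:</b> Dupont, Durand<br />"
    "<b>MR:</b> Martin</p>", "html.parser").p

role = p.b.text[:-1]                          # "Membres effectifs"
labels = [b.text[:-1] for b in p('b')[1:]]    # ["PS", "MR"]
chunks = str(p).split("<br/>")[1:]            # the names following each label
counts = [(label, len(chunk.split(','))) for label, chunk in zip(labels, chunks)]
print(role, counts)                           # Membres effectifs [('PS', 2), ('MR', 1)]
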
Example #4
def scrape():
    for a, url in enumerate(
        ('http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=y',
         'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=n'
         )):
        soup, suppe = read_or_dl_with_nl(url, "annual reports %i" % a)

        for i, j in zip(
                soup.find('div', id="story")('table')[1]('tr',
                                                         recursive=False)[::5],
                suppe.find('div',
                           id="story")('table')[1]('tr',
                                                   recursive=False)[::5]):
            get_or_create(
                AnnualReport,
                title={
                    "fr": i('td')[2].text,
                    "nl": j('td')[2].text
                },
                date=i('td')[0].text,
                law_and_article={
                    "fr": i('td')[4].text,
                    "nl": j('td')[4].text
                },
                periodicity=re.sub("[^0-9]", "",
                                   i('td')[5].text),
                pdf_url=i('td')[1].a["href"] if i('td')[1].a else "",
            )
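
read_or_dl and read_or_dl_with_nl are the project's fetch helpers and are likewise not shown in these snippets. From the way they are called (a URL, a human-readable label used for logging, an optional reset flag, and a French/Dutch pair of soups coming back), something along these lines is a plausible sketch; the cache directory and the language=fr to language=nl substitution are assumptions, and the project itself ran on Python 2 with BeautifulSoup 3 rather than the Python 3/bs4 combination used here.

import os
import hashlib
from urllib.request import urlopen

from bs4 import BeautifulSoup

CACHE_DIR = "cache"  # hypothetical cache location

def read_or_dl(url, label, reset=False):
    # Fetch the page once and keep a copy on disk so later runs can be replayed
    # without hammering lachambre.be; reset forces a fresh download. The label
    # is presumably only used for logging in the real helper.
    path = os.path.join(CACHE_DIR, hashlib.md5(url.encode("utf-8")).hexdigest())
    if reset or not os.path.exists(path):
        os.makedirs(CACHE_DIR, exist_ok=True)
        html = urlopen(url).read()
        with open(path, "wb") as cached:
            cached.write(html)
    else:
        with open(path, "rb") as cached:
            html = cached.read()
    return BeautifulSoup(html, "html.parser")

def read_or_dl_with_nl(url, label, reset=False):
    # The site serves every page in French and in Dutch; the NL soup is the
    # same page fetched with the language switched in the query string.
    return (read_or_dl(url, label, reset),
            read_or_dl(url.replace("language=fr", "language=nl"), label + " (nl)", reset))
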
Example #5
def check_for_new_documents():
    for document_page in read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        for soup, suppe in zip(soup('table')[4]('tr', valign="top"), suppe('table')[4]('tr', valign="top")):
            if not Document.objects.filter(lachambre_id=soup.div.text):
                url = soup.a["href"]
                title = soup("div")[1].text
                lachambre_id = soup.div.text
                logger.info("find a new document: %s - [%s] -  %s" % (LACHAMBRE_PREFIX + url, lachambre_id, title))
                document = Document(title={"fr": soup('div')[1].text, "nl": suppe('div')[1].text}, lachambre_id=lachambre_id, url=soup.a["href"])
                handle_document(document)
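
check_for_new_documents selects its rows with a call of the form soup('div', **{'class': re.compile("linklist_[01]")}): calling a soup object is BeautifulSoup shorthand for findAll/find_all, and the class filter has to be passed through a dict because class is a reserved word in Python. A tiny self-contained illustration on invented markup (shown with bs4; the project used the older BeautifulSoup 3, where the same call works).

# Invented markup, for illustration only.
import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<div class="linklist_0"><a href="a.cfm">2010-2011</a></div>'
    '<div class="linklist_1"><a href="b.cfm">2011-2012</a></div>'
    '<div class="footer">ignored</div>', "html.parser")

pages = soup('div', **{'class': re.compile("linklist_[01]")})
print([div.a.text for div in pages])   # ['2010-2011', '2011-2012']
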
Example #6
def scrape():
    for a, url in enumerate(('http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=y',
                         'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=n')):
        soup, suppe = read_or_dl_with_nl(url, "annual reports %i" % a)

        for i, j in zip(soup.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5], suppe.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5]):
            get_or_create(AnnualReport,
                          title={"fr": i('td')[2].text, "nl": j('td')[2].text},
                          date=i('td')[0].text,
                          law_and_article={"fr": i('td')[4].text, "nl": j('td')[4].text},
                          periodicity=re.sub("[^0-9]", "", i('td')[5].text),
                          pdf_url=i('td')[1].a["href"] if i('td')[1].a else "",
                          )
Example #7
def _handle_deputy(deputy, reset=False):
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + deputy.url, deputy.full_name, reset)
    deputy.language = soup.i.parent.text.split(":")[1] if soup.i else None
    deputy.cv["fr"] = re.sub('  +', ' ', soup('table')[5].p.text)
    deputy.cv["nl"] = re.sub('  +', ' ', suppe('table')[5].p.text)
    if deputy.cv["fr"].encode("Utf-8").startswith("Députée"):
        deputy.sex = "F"
    elif deputy.cv["fr"].encode("Utf-8").startswith("Député"):
        deputy.sex = "M"
    else:
        deputy.sex = None

    _split_deputy_full_name(deputy, soup)
    #_get_deputie_commissions(soup, deputy)
    #_deputy_documents(soup, deputy)
    deputy.save()
Example #8
def _handle_deputy(deputy, reset=False):
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + deputy.url,
                                     deputy.full_name, reset)
    deputy.language = soup.i.parent.text.split(":")[1] if soup.i else None
    deputy.cv["fr"] = re.sub('  +', ' ', soup('table')[5].p.text)
    deputy.cv["nl"] = re.sub('  +', ' ', suppe('table')[5].p.text)
    if deputy.cv["fr"].encode("Utf-8").startswith("Députée"):
        deputy.sex = "F"
    elif deputy.cv["fr"].encode("Utf-8").startswith("Député"):
        deputy.sex = "M"
    else:
        deputy.sex = None

    _split_deputy_full_name(deputy, soup)
    #_get_deputie_commissions(soup, deputy)
    #_deputy_documents(soup, deputy)
    deputy.save()
Example #9
def handle_deputy(deputy, reset=False):
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + deputy.url, deputy.full_name, reset)
    deputy.photo_uri = "http://www.lachambre.be" + soup.table.img["src"]
    # XXX can't get this anymore I guess :(
    # deputy.language = soup.table.i.parent.text.split(":")[1] if soup.i else None
    deputy.cv["fr"] = re.sub('  +', ' ', soup('table')[1].p.text)
    deputy.cv["nl"] = re.sub('  +', ' ', suppe('table')[1].p.text)
    if deputy.cv["fr"].encode("Utf-8").startswith("Députée"):
        deputy.sex = "F"
    elif deputy.cv["fr"].encode("Utf-8").startswith("Député"):
        deputy.sex = "M"
    else:
        deputy.sex = None

    split_deputy_full_name(deputy, soup)
    # _get_deputie_commissions(soup, deputy)
    # _deputy_documents(soup, deputy)
    deputy.save()
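
The gender check shared by the three deputy handlers depends on the order of the tests: the feminine form "Députée" has to be tried before "Député", because the latter is a prefix of the former and would otherwise match both. The encode("Utf-8") calls are a Python 2 detail, comparing the CV as UTF-8 bytes against byte literals; on Python 3 the comparison can be done directly on the string, roughly as in this sketch (guess_sex_from_cv is a hypothetical name, not a project function).

def guess_sex_from_cv(cv_fr):
    # Python 3 version of the check above: cv_fr is already a str, and the
    # feminine form is tested first because "Député" is a prefix of "Députée".
    if cv_fr.startswith("Députée"):
        return "F"
    if cv_fr.startswith("Député"):
        return "M"
    return None
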
Example #10
def handle_commission(commission):
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + commission.url, "commission %s" % commission.lachambre_id)
    commission.full_name["fr"] = soup.h1.text
    commission.full_name["nl"] = suppe.h1.text
    commission.deputies = []
    seats = {"fr": {}, "nl": {}}
    for i, j in zip(soup('p'), suppe('p')):
        role = i.b.text[:-1]
        role_nl = j.b.text[:-1]
        for dep in i('a'):
            deputy = Deputy.objects.get(lachambre_id=re.search("key=([O0-9]+)", dep["href"]).groups()[0])
            membership = get_or_create(CommissionMembership, deputy=deputy, commission=commission)
            membership.role = role
            membership.save()
            commission.deputies.append(membership.id)
        seats["fr"][role] = map(lambda x: (x[0], len(x[1].split(','))), zip(map(lambda x: x.text[:-1], i('b')[1:]), str(i).split("<br />")[1:]))
        seats["nl"][role_nl] = map(lambda x: (x[0], len(x[1].split(','))), zip(map(lambda x: x.text[:-1], i('b')[1:]), str(i).split("<br />")[1:]))

    commission.seats = seats
    commission.save()
Example #11
def scrape():
    soup, suppe = read_or_dl_with_nl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/comm/commissions&language=fr&cfm=/site/wwwcfm/comm/LstCom.cfm&rightmenu=right_cricra", "commissions list")
    _type = ""
    for i, j in zip(soup("div", id="story")[1], suppe("div", id="story")[1]):
        if not isinstance(i, NavigableString) and (i.h4 or i.a):
            if i.h4:
                _type = i.h4.text
                _type_nl = j.h4.text
            elif i.a:
                commission = get_or_create(Commission, lachambre_id=int(re.search(r"com=(\d+)", i.a["href"]).groups()[0]))
                commission.type["fr"] = _type
                commission.type["nl"] = _type_nl
                commission.name["fr"] = i.a.text
                commission.name["nl"] = j.a.text
                commission.url = i.a["href"]

                commission.save()

    for com in list(Commission.objects.all()):
        handle_commission(com)
Example #12
def get_new_documents():
    for document_page in read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        for soup, suppe in zip(soup('table')[4]('tr', valign="top"), suppe('table')[4]('tr', valign="top")):
            get_or_create(Document, _id="lachambre_id", title={"fr": soup('div')[1].text, "nl": suppe('div')[1].text}, lachambre_id=soup.div.text, url=soup.a["href"])