def fetch_list(klass, cache=False, sync=False):
    """Scrape the two annual-report listing pages (lastreports=y / n) and
    store each row as an AnnualReport via get_or_create.

    :param klass: the scraper class this classmethod-style function belongs to
    :param cache: accepted for signature consistency with sibling fetchers
    :param sync: accepted for signature consistency with sibling fetchers
    """
    urls = ('http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=y',
            'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=n')
    for a, url in enumerate(urls):
        # fetch both the French and the Dutch version of the page
        # (fixed typo in the description label: "repports" -> "reports")
        soup, suppe = scraper.get_with_nl(url, "annual reports %i" % a)
        # each report occupies 5 <tr>s in the listing table; only the first
        # row of each group carries the data cells we need, hence [::5]
        rows_fr = soup.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5]
        rows_nl = suppe.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5]
        for i, j in zip(rows_fr, rows_nl):
            get_or_create(AnnualReport,
                          title={"fr": i('td')[2].text, "nl": j('td')[2].text},
                          date=i('td')[0].text,
                          law_and_article={"fr": i('td')[4].text, "nl": j('td')[4].text},
                          # keep only the digits of the periodicity cell
                          periodicity=re.sub("[^0-9]", "", i('td')[5].text),
                          # some rows have no pdf link at all
                          pdf_url=i('td')[1].a["href"] if i('td')[1].a else "",
                          )
def fetch_list(klass, cache=False, sync=False):
    """Walk the paginated document index, register every document (id, title,
    url), then fully parse every document not yet marked done.

    :param klass: provides fetch_one(document) for the detailed parsing pass
    """
    index_url = "http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm"
    for document_page in scraper.get(index_url, "all documents")('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + document_page.a["href"],
                                          "document %s" % document_page.a.text)
        # renamed the inner loop variables: the original shadowed soup/suppe
        for row_fr, row_nl in zip(soup.table('tr'), suppe.table('tr')):
            get_or_create(Document, _id="lachambre_id",
                          title={"fr": row_fr('div')[1].text, "nl": row_nl('div')[1].text},
                          lachambre_id=row_fr.div.text,
                          url=row_fr.a["href"])
    # list() otherwise mongodb will timeout if we stay in a query mode
    for document in list(Document.objects.filter(done=False)):
        # NOTE(review): document 25 is skipped deliberately — reason unknown,
        # presumably a known-broken page; confirm before removing
        if document.lachambre_id == 25:
            continue
        try:
            klass.fetch_one(document)
        except Exception as e:
            traceback.print_exc(file=sys.stdout)
            # bugfix: the original message read "Error: while reparsing
            # document %s" so the exception was formatted where the document
            # id belonged; also fixed the "succed" misspelling
            logger.error("/!\\ %s didn't succeed! Error: %s while reparsing this document" % (document.lachambre_id, e))
def fetch_one(klass, deputy, cache=False, sync=False):
    """Refresh a single deputy: photo uri, bilingual CV, sex; then save."""
    soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + deputy.url, deputy.full_name, cache=cache, sync=sync)
    deputy.photo_uri = "http://www.lachambre.be" + soup.table.img["src"]
    # XXX can't get this anymore I guess :(
    # deputy.language = soup.table.i.parent.text.split(":")[1] if soup.i else None
    # collapse runs of spaces in the CV text, French page first then Dutch
    for lang, page in (("fr", soup), ("nl", suppe)):
        deputy.cv[lang] = re.sub(' +', ' ', page('table')[1].p.text).strip()
    # derive the sex from the CV's opening word; the feminine form
    # "Députée" has to be tested before "Député" (it is a prefix of it)
    cv_start = deputy.cv["fr"].encode("Utf-8")
    if cv_start.startswith("Députée"):
        deputy.sex = "F"
    elif cv_start.startswith("Député"):
        deputy.sex = "M"
    else:
        deputy.sex = None
    Deputy.split_deputy_full_name(deputy, soup)
    deputy.save()
def fetch_one(klass, commission, cache=False, sync=False):
    """Parse one commission page: bilingual full name, the membership of
    every deputy per role, and the per-role seat counts.

    :param commission: a Commission document with .url and .lachambre_id set
    """
    # bugfix: cache/sync were accepted by the signature but never forwarded
    soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + commission.url,
                                      "commission %s" % commission.lachambre_id,
                                      cache=cache, sync=sync)
    commission.full_name["fr"] = soup.h1.text
    commission.full_name["nl"] = suppe.h1.text
    commission.deputies = []
    seats = {"fr": {}, "nl": {}}
    # the first two <p>s are not role paragraphs, hence [2:]
    for i, j in zip(soup('p')[2:], suppe('p')[2:]):
        role = i.b.text[:-1]      # drop the trailing ":" of the role label
        role_nl = j.b.text[:-1]
        for dep in i('a'):
            # NOTE(review): the character class [O0-9] also matches a capital
            # letter O — presumably on purpose for malformed keys; confirm
            deputy = Deputy.objects.get(lachambre_id=re.search("key=([O0-9]+)", dep["href"]).groups()[0])
            membership = get_or_create(CommissionMembership, deputy=deputy, commission=commission)
            membership.role = role
            membership.save()
            commission.deputies.append(membership.id)
        # per sub-role seat count: names are comma separated after each <br />
        seats["fr"][role] = map(lambda x: (x[0], len(x[1].split(','))),
                                zip(map(lambda x: x.text[:-1], i('b')[1:]), str(i).split("<br />")[1:]))
        # bugfix: the NL seats were computed from the FR paragraph (i) — use j
        seats["nl"][role_nl] = map(lambda x: (x[0], len(x[1].split(','))),
                                   zip(map(lambda x: x.text[:-1], j('b')[1:]), str(j).split("<br />")[1:]))
    commission.seats = seats
    commission.save()
def fetch_list(klass, cache=False, sync=False):
    """Scrape the commissions index: record every commission's type, name and
    url, then run fetch_one on all known commissions."""
    soup, suppe = scraper.get_with_nl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/comm/commissions&language=fr&cfm=/site/wwwcfm/comm/LstCom.cfm&rightmenu=right_cricra",
                                      "commissions list")
    _type = ""
    # bugfix: _type_nl was never initialized; a commission link appearing
    # before any <h4> heading raised a NameError (while _type was pre-set)
    _type_nl = ""
    # the story div interleaves <h4> type headings and <a> commission links
    for i, j in zip(soup("div", id="story")[1], suppe("div", id="story")[1]):
        if not isinstance(i, NavigableString) and (i.h4 or i.a):
            if i.h4:
                # a new section: remember the current type in both languages
                _type = i.h4.text
                _type_nl = j.h4.text
            elif i.a:
                commission = get_or_create(Commission, lachambre_id=int(re.search(r"com=(\d+)", i.a["href"]).groups()[0]))
                commission.type["fr"] = _type
                commission.type["nl"] = _type_nl
                commission.name["fr"] = i.a.text
                commission.name["nl"] = j.a.text
                commission.url = i.a["href"]
                commission.save()
    # list() so mongodb doesn't time out while we stay in a query cursor
    for com in list(Commission.objects.all()):
        klass.fetch_one(com, cache=cache, sync=sync)
def fetch_one(klass, link, cache=False, sync=False):
    # Parse one written-question page (FR + NL) and upsert it as a
    # WrittenQuestion keyed on its lachambre_id (extracted from the link).
    soupsoup, suppesuppe = scraper.get_with_nl(LACHAMBRE_PREFIX + link.a["href"], "written question %s" % re.search(DOSSIER_ID_REGEX, link.a["href"]).groups()[0])
    # Build a "label -> value <td>" mapping for every row of the details
    # table; AccessControlDict records which keys get read (checked at the
    # end so scraper drift on the page is not silently ignored).
    data = AccessControlDict(((x.td.text.strip(), x('td')[1]) for x in soupsoup.find('table', 'txt')('tr') if x.td.text))
    data_nl = AccessControlDict(((x.td.text.strip(), x('td')[1]) for x in suppesuppe.find('table', 'txt')('tr') if x.td.text))
    # debug output: lets a maintainer spot new/renamed labels on the page
    print sorted(data.keys())
    print sorted(data_nl.keys())
    update_or_create(WrittenQuestion,
                     _id="lachambre_id",
                     lachambre_id=re.search(DOSSIER_ID_REGEX, link.a["href"]).groups()[0],
                     title={"fr": data["Titre"].text, "nl": data_nl["Titel"].text},
                     # the accented keys below are the literal labels of the
                     # French page ("Département", "Sous-département", ...)
                     departement={"fr": data[u"D\xe9partement"].text, "nl": data_nl[u"Departement"].text},
                     sub_departement={"fr": data[u"Sous-d\xe9partement"].text, "nl": data_nl[u"Sub-departement"].text},
                     deposition_date=data[u"Date de d\xe9p\xf4t"].text,
                     # dico_get_text tolerates missing keys (optional fields)
                     delay_date=dico_get_text(data, u"Date de d\xe9lai"),
                     publication_date=dico_get_text(data, "Date publication"),
                     # TODO: link to the actual deputy
                     author=data[u"Auteur"].text,
                     language=data[u"Langue"].text,
                     question_status={"fr": dico_get_text(data, "Statut question"), "nl": dico_get_text(data_nl, "Status vraag")},
                     status={"fr": dico_get_text(data, "Statut"), "nl": dico_get_text(data_nl, "Status")},
                     # keep the full html of question/answer, not just .text
                     question={"fr": u"%s" % data["Question"], "nl": "%s" % data_nl["Vraag"]},
                     answer={"fr": dico_get_text(data, u"R\xe9ponse"), "nl": dico_get_text(data_nl, u"Antwoord")},
                     publication_reponse_pdf_url=get_href_else_blank(data, u"Publication r\xe9ponse"),
                     publication_question_pdf_url=get_href_else_blank(data, u"Publication question"),
                     publication_reponse=get_text_else_blank(data, u"Publication r\xe9ponse"),
                     publication_question=get_text_else_blank(data, u"Publication question"),
                     eurovoc_descriptors={"fr": get_items_list_else_empty_list(data, "Descripteurs Eurovoc"), "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-descriptoren")},
                     eurovoc_principal_descriptors={"fr": get_items_list_else_empty_list(data, "Desc. Eurovoc principal"), "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-hoofddescriptor")},
                     eurovoc_candidats_descriptors={"fr": get_items_list_else_empty_list(data, "Candidats-descripteurs Eurovoc"), "nl": get_items_list_else_empty_list(data_nl, "Eurovoc kandidaat-descriptoren")},
                     keywords={"fr": get_items_list_else_empty_list(data, u"Mots-cl\xe9s libres"), "nl": get_items_list_else_empty_list(data_nl, u"Vrije trefwoorden")},
                     url=link.a["href"],
                     )
    # Raise if the FR page contained labels that were never read above.
    # NOTE(review): only the FR dict is checked — data_nl is not; confirm
    # whether that asymmetry is intended.
    data.die_if_got_not_accessed_keys()