def _get_document_chambre(dico, dico_nl, document):
    """Build the Chambre sub-document from the scraped tables and attach it.

    Returns early (doing nothing) when ``dico`` has no truthy
    "Document Chambre" entry. Otherwise a ``DocumentChambre`` is filled from
    the French section of ``dico`` and the Dutch section of ``dico_nl``,
    its main PDF record is created, the object is saved and finally stored
    on ``document.document_chambre``.
    """
    if not dico.get("Document Chambre"):
        return

    section_fr = dico['Document Chambre']
    section_nl = dico_nl['Document Kamer']

    chambre = DocumentChambre()

    # Plain fields: dates come from the French section only, labels from both.
    chambre.deposition_date = get_text_else_blank(section_fr, u'Date de dépôt')
    chambre.type["fr"] = section_fr[u'Type de document'].text
    chambre.type["nl"] = section_nl[u'Document type'].text
    chambre.taken_in_account_date = get_text_else_blank(section_fr, u'Prise en considération')
    chambre.distribution_date = get_text_else_blank(section_fr, u'Date de distribution')
    chambre.sending_date = get_text_else_blank(section_fr, u'Date d\'envoi')
    chambre.ending_date = get_text_else_blank(section_fr, u'Date de fin')
    chambre.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    chambre.status["nl"] = get_text_else_blank(section_nl, u'Status')

    comment_fr = get_text_else_blank(section_fr, u'Commentaire')
    chambre.comments["fr"] = comment_fr.split(' ')
    comment_nl = get_text_else_blank(section_nl, u'Commentaar')
    chambre.comments["nl"] = comment_nl.split(' ')

    Document._get_authors(section_fr, section_nl, chambre)

    # The 'head' cell is expected to hold exactly three "<br />"-separated
    # parts: the pdf link (or plain url), the document type and the session.
    # NOTE(review): the character removed by replace() is reproduced as it
    # appears in this file -- confirm it is the intended one.
    head_fr = clean_text(str(section_fr[u'head']).replace(" ", ""))
    url, type_fr, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace(" ", ""))
    _, type_nl, _ = head_nl.split("<br />")

    if "href" in url:
        url = re.search('href="([^"]+)', url).groups()[0]

    chambre.pdf = DocumentChambrePdf.objects.create(
        url=url,
        type={"fr": type_fr.strip(), "nl": type_nl.strip()},
        session=session.split()[-2],
    )

    Document._get_next_documents(section_fr, section_nl, chambre)

    if section_fr.get(u'Document(s) joint(s)/lié(s)'):
        joint_pairs = zip(
            section_fr[u'Document(s) joint(s)/lié(s)'],
            section_nl[u'Gekoppeld(e)/verbonden document(en)'],
        )
        joint_pdfs = []
        for item_fr, item_nl in joint_pairs:
            # [1:-1] drops the first and last character of each title.
            joint_pdfs.append({
                "url": item_fr.a["href"],
                "title": {
                    "fr": item_fr.contents[0][1:-1],
                    "nl": item_nl.contents[0][1:-1],
                },
            })
        chambre.joint_pdfs = joint_pdfs

    chambre.save()
    document.document_chambre = chambre
def fetch_one(klass, link, cache=False, sync=False):
    """Scrape one written question (French and Dutch pages) and upsert it.

    ``link`` must expose ``link.a["href"]``; the dossier id is extracted
    from that href with ``DOSSIER_ID_REGEX`` and used as the
    ``lachambre_id`` of the ``WrittenQuestion`` passed to
    ``update_or_create``.

    ``klass``, ``cache`` and ``sync`` are accepted for interface
    compatibility but are not used by this function.

    After the upsert, ``data.die_if_got_not_accessed_keys()`` is called on
    the French table only.
    """
    href = link.a["href"]
    # Computed once: it is both the scraper label and the record identifier.
    lachambre_id = re.search(DOSSIER_ID_REGEX, href).groups()[0]

    soupsoup, suppesuppe = scraper.get_with_nl(LACHAMBRE_PREFIX + href, "written question %s" % lachambre_id)

    def table_to_dict(soup):
        # Map the stripped text of each row's first cell to its second cell,
        # skipping rows whose first cell has no text.
        rows = soup.find('table', 'txt')('tr')
        return AccessControlDict(((row.td.text.strip(), row('td')[1]) for row in rows if row.td.text))

    data = table_to_dict(soupsoup)
    data_nl = table_to_dict(suppesuppe)

    # Debug output of the keys found on each page; the parenthesised form
    # prints the same text under the Python 2 print statement.
    print(sorted(data.keys()))
    print(sorted(data_nl.keys()))

    update_or_create(
        WrittenQuestion,
        _id="lachambre_id",
        lachambre_id=lachambre_id,
        title={"fr": data["Titre"].text, "nl": data_nl["Titel"].text},
        departement={"fr": data[u"D\xe9partement"].text, "nl": data_nl[u"Departement"].text},
        sub_departement={"fr": data[u"Sous-d\xe9partement"].text, "nl": data_nl[u"Sub-departement"].text},
        deposition_date=data[u"Date de d\xe9p\xf4t"].text,
        delay_date=dico_get_text(data, u"Date de d\xe9lai"),
        publication_date=dico_get_text(data, "Date publication"),
        # TODO: link to the actual deputy
        author=data[u"Auteur"].text,
        language=data[u"Langue"].text,
        question_status={"fr": dico_get_text(data, "Statut question"), "nl": dico_get_text(data_nl, "Status vraag")},
        status={"fr": dico_get_text(data, "Statut"), "nl": dico_get_text(data_nl, "Status")},
        # Both languages use a unicode format string so the two values have
        # the same string type (the Dutch one used a byte-string format).
        question={"fr": u"%s" % data["Question"], "nl": u"%s" % data_nl["Vraag"]},
        answer={"fr": dico_get_text(data, u"R\xe9ponse"), "nl": dico_get_text(data_nl, u"Antwoord")},
        publication_reponse_pdf_url=get_href_else_blank(data, u"Publication r\xe9ponse"),
        publication_question_pdf_url=get_href_else_blank(data, u"Publication question"),
        publication_reponse=get_text_else_blank(data, u"Publication r\xe9ponse"),
        publication_question=get_text_else_blank(data, u"Publication question"),
        eurovoc_descriptors={"fr": get_items_list_else_empty_list(data, "Descripteurs Eurovoc"), "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-descriptoren")},
        eurovoc_principal_descriptors={"fr": get_items_list_else_empty_list(data, "Desc. Eurovoc principal"), "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-hoofddescriptor")},
        eurovoc_candidats_descriptors={"fr": get_items_list_else_empty_list(data, "Candidats-descripteurs Eurovoc"), "nl": get_items_list_else_empty_list(data_nl, "Eurovoc kandidaat-descriptoren")},
        keywords={"fr": get_items_list_else_empty_list(data, u"Mots-cl\xe9s libres"), "nl": get_items_list_else_empty_list(data_nl, u"Vrije trefwoorden")},
        url=href,
    )

    data.die_if_got_not_accessed_keys()
def _get_first_level_data(dico, dico_nl, document):
    """Copy the top-level fields of a scraped document onto ``document``.

    ``dico`` holds the French table and ``dico_nl`` the Dutch one. Dates and
    numbers are read from the French table only; bilingual fields are stored
    under the ``"fr"`` / ``"nl"`` keys of the matching ``document``
    attribute. Optional entries are only copied when present (truthy) in the
    table of their own language.
    """

    def pipe_list(cell):
        # Split the "|"-separated text of the cell's 'head' into stripped items.
        return [item.strip() for item in cell["head"].text.split("|")]

    def content_list(cell):
        # Strip every child of the cell except those equal to "<br>".
        return [item.strip() for item in cell.contents if item != "<br>"]

    document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
    document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
    document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))

    if dico.get("Descripteur Eurovoc principal"):
        document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
    # Dutch keys are looked up in the Dutch table; testing them against the
    # French table meant the Dutch values were never filled in.
    if dico_nl.get("Eurovoc-hoofddescriptor"):
        document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text

    document.vote_date = get_text_else_blank(dico, "Vote Chambre")
    document.law_date = get_text_else_blank(dico, "Date de la loi")
    document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
    document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
    document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
    document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

    if dico.get("Etat d'avancement"):
        progress_fr = dico["Etat d'avancement"]
        document.status_chambre["fr"] = clean_text(progress_fr.contents[0])
        # The senate status is the third child, when the cell has one.
        document.status_senat["fr"] = clean_text(progress_fr.contents[2]) if len(progress_fr) >= 3 else None
    if dico_nl.get("Stand van zaken"):
        progress_nl = dico_nl["Stand van zaken"]
        document.status_chambre["nl"] = clean_text(progress_nl.contents[0])
        document.status_senat["nl"] = clean_text(progress_nl.contents[2]) if len(progress_nl) >= 3 else None

    if dico.get("Descripteurs Eurovoc"):
        document.eurovoc_descriptors["fr"] = pipe_list(dico["Descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc descriptoren"):
        document.eurovoc_descriptors["nl"] = pipe_list(dico_nl["Eurovoc descriptoren"])

    if dico.get("Candidats-descripteurs Eurovoc"):
        document.eurovoc_candidats_descriptors["fr"] = pipe_list(dico["Candidats-descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc kandidaat-descriptoren"):
        document.eurovoc_candidats_descriptors["nl"] = pipe_list(dico_nl["Eurovoc kandidaat-descriptoren"])

    if dico.get(u"Mots-clés libres"):
        document.keywords["fr"] = pipe_list(dico[u"Mots-clés libres"])
    if dico_nl.get(u"Vrije trefwoorden"):
        document.keywords["nl"] = pipe_list(dico_nl[u"Vrije trefwoorden"])

    if dico.get("Documents principaux"):
        document.main_docs["fr"] = content_list(dico["Documents principaux"])
    # NOTE(review): "Hoodfdocumenten" is kept exactly as it was; it may be a
    # misspelling of the key used on the Dutch page -- verify against the site.
    if dico_nl.get("Hoodfdocumenten"):
        document.main_docs["nl"] = content_list(dico_nl["Hoodfdocumenten"])
def _get_document_senat(dico, dico_nl, document):
    """Build the Senate sub-document from the scraped tables and attach it.

    Returns early (doing nothing) when ``dico`` has no truthy
    u"Document Sénat" entry. Otherwise a ``DocumentSenat`` is filled from
    the French section of ``dico`` and the Dutch section of ``dico_nl``,
    its main PDF record is created, every following document is saved as an
    ``OtherDocumentSenatPdf`` and appended to ``other_pdfs``, and the result
    is saved and stored on ``document.document_senat``.
    """
    if not dico.get(u"Document Sénat"):
        return

    section_fr = dico[u"Document Sénat"]
    section_nl = dico_nl[u"Document Senaat"]

    senat = DocumentSenat()

    # The deposition date is read directly (no blank fallback), unlike the
    # other optional fields below.
    senat.deposition_date = section_fr[u"Date de dépôt"].text
    senat.ending_date = get_text_else_blank(section_fr, u"Date de fin")
    senat.type["fr"] = section_fr[u"Type de document"].text
    senat.type["nl"] = section_nl[u"Document type"].text

    comment_fr = get_text_else_blank(section_fr, u'Commentaire')
    senat.comments["fr"] = comment_fr.split(' - ')
    comment_nl = get_text_else_blank(section_nl, u'Commentaar')
    senat.comments["nl"] = comment_nl.split(' - ')

    senat.author = clean_text(get_text_else_blank(section_fr, u"Auteur(s)"))
    senat.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    senat.status["nl"] = get_text_else_blank(section_nl, u'Status')

    # The 'head' cell is expected to hold exactly three "<br />"-separated
    # parts: the pdf link (or plain url), the document type and the session.
    # NOTE(review): the character removed by replace() is reproduced as it
    # appears in this file -- confirm it is the intended one.
    head_fr = clean_text(str(section_fr[u'head']).replace(" ", ""))
    url, type_fr, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace(" ", ""))
    _, type_nl, _ = head_nl.split("<br />")

    if "href" in url:
        url = re.search('href="([^"]+)', url).groups()[0]

    senat.pdf = DocumentSenatPdf.objects.create(
        url=url,
        type={"fr": type_fr.strip(), "nl": type_nl.strip()},
        session=session.split()[-2],
    )

    if section_fr.get('Document(s) suivant(s)'):
        following_fr = document_pdf_part_cutter(section_fr[u'Document(s) suivant(s)'])
        following_nl = document_pdf_part_cutter(section_nl[u'Opvolgend(e) document(en)'])

        for part_fr, part_nl in zip(following_fr, following_nl):
            # First element describes the pdf itself; the rest are authors.
            header_fr = part_fr[0]
            header_nl = part_nl[0]
            logger.debug("add pdf %s" % clean_text(header_fr.font.text))

            other = OtherDocumentSenatPdf()
            if header_fr.a:
                other.url = header_fr.a['href']
            else:
                # No link available: fall back to the text of the first cell.
                other.url = header_fr.td.text
            other.type["fr"] = clean_text(header_fr.font.text)
            other.type["nl"] = clean_text(header_nl.font.text)
            other.date = header_fr('td')[-1].contents[0]

            other.authors = []
            for author_fr, author_nl in zip(part_fr[1:], part_nl[1:]):
                last_cell_fr = author_fr('td')[-1]
                last_cell_nl = author_nl('td')[-1]
                other.authors.append({
                    "full_name": unicode(last_cell_fr.contents[2]).strip(),
                    # [1:-1] drops the first and last character of the role.
                    "role": {
                        "fr": last_cell_fr.i.text[1:-1],
                        "nl": last_cell_nl.i.text[1:-1],
                    },
                })

            other.save()
            senat.other_pdfs.append(other)

    senat.save()
    document.document_senat = senat