def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre): if chambre_dico.get('Document(s) suivant(s)'): for d, d_nl in zip(document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)']), document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])): logger.debug("add pdf %s" % clean_text(d[0].font.text)) doc = OtherDocumentChambrePdf() doc.url = d[0].a['href'] if d[0].a else d[0].td.text doc.type["fr"] = clean_text(d[0].font.text) doc.type["nl"] = clean_text(d_nl[0].font.text) doc.distribution_date = d[1]('td')[-1].text for dep, dep_nl in zip(d[2:], d_nl[2:]): if dep.a: lachambre_id = re.search('key=(\d+)', dep.a["href"]).groups()[0] deputy = Deputy.objects.get(lachambre_id=lachambre_id) doc.authors.append({"lachambre_id": deputy.lachambre_id, "id": deputy.id, "full_name": deputy.full_name, "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}}) else: doc.authors.append({"lachambre_id": -1, "id": -1, "full_name": dep('td')[-1].contents[2].strip(), "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}}) doc.save() document_chambre.other_pdfs.append(doc)
def _get_document_senat(dico, dico_nl, document): if not dico.get(u"Document Sénat"): return senat_dico = dico[u"Document Sénat"] senat_dico_nl = dico_nl[u"Document Senaat"] document_senat = DocumentSenat() document_senat.deposition_date = senat_dico[u"Date de dépôt"].text document_senat.ending_date = get_text_else_blank(senat_dico, u"Date de fin") document_senat.type["fr"] = senat_dico[u"Type de document"].text document_senat.type["nl"] = senat_dico_nl[u"Document type"].text document_senat.comments["fr"] = get_text_else_blank(senat_dico, u'Commentaire').split(' - ') document_senat.comments["nl"] = get_text_else_blank(senat_dico_nl, u'Commentaar').split(' - ') document_senat.author = clean_text(get_text_else_blank(senat_dico, u"Auteur(s)")) document_senat.status["fr"] = get_text_else_blank(senat_dico, u'Statut') document_senat.status["nl"] = get_text_else_blank(senat_dico_nl, u'Status') url, tipe, session = clean_text(str(senat_dico[u'head']).replace(" ", "")).split("<br />") _, tipe_nl, _ = clean_text(str(senat_dico_nl[u'head']).replace(" ", "")).split("<br />") url = re.search('href="([^"]+)', url).groups()[0] if "href" in url else url document_senat.pdf = DocumentSenatPdf.objects.create(url=url, type={"fr": tipe.strip(), "nl": tipe_nl.strip()}, session=session.split()[-2]) if senat_dico.get('Document(s) suivant(s)'): for d, d_nl in zip(document_pdf_part_cutter(senat_dico[u'Document(s) suivant(s)']), document_pdf_part_cutter(senat_dico_nl[u'Opvolgend(e) document(en)'])): logger.debug("add pdf %s" % clean_text(d[0].font.text)) doc = OtherDocumentSenatPdf() doc.url = d[0].a['href'] if d[0].a else d[0].td.text doc.type["fr"] = clean_text(d[0].font.text) doc.type["nl"] = clean_text(d_nl[0].font.text) doc.date = d[0]('td')[-1].contents[0] doc.authors = [] for dep, dep_nl in zip(d[1:], d_nl[1:]): doc.authors.append({"full_name": unicode(dep('td')[-1].contents[2]).strip(), "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}}) doc.save() document_senat.other_pdfs.append(doc) document_senat.save() document.document_senat = document_senat