def add_metrics_via_adhoc_parsing(dos, log=sys.stderr):
    senat_dos = download_senat(dos['URL du dossier'], log=log)
    if not senat_dos:
        print(' /!\\ INVALID SENAT DOS')
        return

    # Add AN version if there's one
    parsed_dos = senat_dos
    if 'url_dossier_assemblee' in senat_dos:
        dos['URL du dossier Assemblée'] = senat_dos.get('url_dossier_assemblee')
        an_dos = download_an(senat_dos['url_dossier_assemblee'],
                             senat_dos['url_dossier_senat'], log=log)
        if 'url_dossier_senat' in an_dos and are_same_doslegs(senat_dos, an_dos):
            parsed_dos = merge_senat_with_an(senat_dos, an_dos)

    dos['Titre court'] = parsed_dos['short_title']
    dos['Type de procédure'] = "accélérée" if parsed_dos['urgence'] else "normale"
    dos['Initiative du texte'], dos['Institution de dépôt'] = get_initiative(parsed_dos['steps'])
    dos['Étapes échouées'] = count_echecs(parsed_dos['steps'])
    dos['CMP'] = get_CMP_type(parsed_dos['steps'])

    cc_step = [step['source_url'] for step in parsed_dos['steps']
               if step.get('stage') == 'constitutionnalité']
    # dos['Taille de la décision du CC'] = get_decision_length(cc_step[0]) if cc_step else ''
    dos['URL CC'] = cc_step[0] if cc_step else ''
    # dos['Signataires au JO'] = count_signataires(parsed_dos['url_jo']) if 'url_jo' in parsed_dos else ''
    dos['URL JO'] = parsed_dos['url_jo'] if 'url_jo' in parsed_dos else ''
    dos["Législature de promulgation"] = parsed_dos.get('assemblee_legislature', '')

    last_depot = find_last_depot(parsed_dos['steps'])

    # Walk the steps backwards to find the last text examined in hemicycle.
    last_text = None
    for step in reversed(parsed_dos['steps']):
        last_text = step
        if step.get('step') == 'hemicycle':
            break
        if step.get('step') == 'commission':
            raise Exception('commission as last step')

    if last_text and 'source_url' in last_text:
        try:
            articles = parse_texte.parse(last_text['source_url'])
            if articles and articles[0].get('definitif'):
                dos['Nombre de caractères final'] = read_text(articles)
            else:
                dos['Nombre de caractères final'] = \
                    get_texte_length(parsed_dos['url_jo']) if 'url_jo' in parsed_dos else ''
        except Exception:
            print("WARNING: Nombre de caractères final impossible to evaluate")

    try:
        input_text_length = read_text(parse_texte.parse(last_depot['source_url']))
        if input_text_length > 0:
            dos['Nombre de caractères initial'] = input_text_length
    except Exception:
        print("WARNING: Nombre de caractères initial impossible to evaluate")
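# Usage sketch for add_metrics_via_adhoc_parsing. The function enriches `dos`
# in place rather than returning a new dict. Assumptions: the dossier URL
# below is illustrative, and download_senat, download_an, parse_texte and the
# other helpers are provided by the surrounding project.
#
#     dos = {'URL du dossier': 'https://www.senat.fr/dossier-legislatif/pjl11-497.html'}
#     add_metrics_via_adhoc_parsing(dos)
#     print(dos.get('Titre court'), dos.get('Type de procédure'), dos.get('CMP'))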
from typing import List


def parse_first_working_url(urls: List[str]) -> List[dict]:
    for url in urls:
        try:
            articles: List[dict] = parse(url, include_annexes=True)
        except TextParsingFailedException:
            logger.exception("Scraping of URL %s failed during text parsing", url)
            articles = []
        except Exception:
            logger.exception("Scraping of URL %s failed in an unexpected way", url)
            articles = []
        if len(articles) > 1:
            return articles
    raise NotFound
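# Usage sketch: walk a list of candidate URLs and keep the first parse that
# yields more than one element (a text plus at least one article). The URLs
# are reused from the parse_texte tests elsewhere in this repo; parse, logger,
# NotFound and TextParsingFailedException are assumed to come from the
# surrounding project's imports.
#
#     candidate_urls = [
#         'http://www.assemblee-nationale.fr/13/rapports/r2568.asp',
#         'https://www.senat.fr/leg/ppl08-039.html',
#     ]
#     try:
#         articles = parse_first_working_url(candidate_urls)
#     except NotFound:
#         articles = []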
print('> testing url fixing')
# AN .pdf
assert find_good_url_resp('http://www.assemblee-nationale.fr/13/pdf/pion1895.pdf').url == \
    'http://www.assemblee-nationale.fr/13/propositions/pion1895.asp'
# senat simple
assert find_good_url_resp('https://www.senat.fr/leg/tas11-040.html').url == \
    'https://www.senat.fr/leg/tas11-040.html'
# senat multi-page but not last page
assert find_good_url_resp('https://www.senat.fr/rap/l07-485/l07-485.html').url == \
    'https://www.senat.fr/rap/l07-485/l07-4851.html'
# senat multi-page but not mono
assert find_good_url_resp('http://www.senat.fr/rap/l09-654/l09-654.html').url == \
    'http://www.senat.fr/rap/l09-654/l09-6542.html'
# senat multi-page text
assert find_good_url_resp('https://www.senat.fr/rap/l08-584/l08-584.html').url == \
    'https://www.senat.fr/rap/l08-584/l08-584_mono.html'
# senat multi-page examen en commission
assert find_good_url_resp('https://www.senat.fr/rap/l09-535/l09-535.html').url == \
    'https://www.senat.fr/rap/l09-535/l09-5358.html'
print(' > OK')

print('> testing parse_texte')
assert len(parse_texte.parse('http://www.assemblee-nationale.fr/13/rapports/r2568.asp')) == 5
assert len(parse_texte.parse('https://www.senat.fr/leg/ppl08-039.html')) == 2
# do not catch annexe as part of last article in legifrance texts
assert len(parse_texte.parse('https://www.legifrance.gouv.fr/affichTexte.do?cidTexte=JORFTEXT000025005833'
                             '&categorieLien=id#JORFSCTA000025005850')[-1]['alineas']) == 2
print(' > OK')

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

with log_print(only_log=True):
    download_groupes.process(OUTPUT_DIR)
    download_lois_dites.process(OUTPUT_DIR)
    opendata_an = download_AN_opendata.process(OUTPUT_DIR)
""" Test annexes retrieval Usage: python tests_parse_annexes.py """ import sys from tlfp.tools import parse_texte if "--enable-cache" in sys.argv: from lawfactory_utils.urls import enable_requests_cache enable_requests_cache() print("> testing parse_texte without annexes (default)") result = parse_texte.parse( "http://www.assemblee-nationale.fr/15/ta-commission/r1056-a0.asp") article = result[-1] assert article["type"] == "article" print(" > OK") print("> testing parse_texte annexes from AN") result = parse_texte.parse( "http://www.assemblee-nationale.fr/15/ta-commission/r1056-a0.asp", include_annexes=True, ) assert len(result) == 94, len(result) annexe = result[-1] assert annexe["type"] == "annexe", annexe["type"] assert annexe["order"] == 82, annexe["order"] assert annexe["statut"] == "none", annexe["statut"] assert (
"""
Test PLF 2nd part for year 2018 retrieval

Usage: python tests_parse_texte_plf.py
"""
import sys

from tlfp.tools import parse_texte

if "--enable-cache" in sys.argv:
    from lawfactory_utils.urls import enable_requests_cache
    enable_requests_cache()

print("> testing parse_texte.parse for PLF 2 (2018)")
result = parse_texte.parse("http://www.assemblee-nationale.fr/15/projets/pl0235.asp")  # , DEBUG=True)

print(" > correct number of articles")
assert len(result) == 67, len(result)
print(" > OK")

print(" > correct content of article 19")
article_19 = result[22]
assert article_19["type"] == "article", article_19["type"]
assert article_19["titre"] == "19", article_19["titre"]
print(" > OK")

print(" > correct content of article 19 alineas")
assert len(article_19["alineas"]) == 67, len(article_19["alineas"])
assert article_19["alineas"]["001"].startswith("I. - L'article 46 de la loi")
assert article_19["alineas"]["067"].startswith("31 décembre de chaque année.")
# Variant without per-URL error handling: a failing parse aborts the whole walk.
def parse_first_working_url(urls: List[str]) -> List[dict]:
    for url in urls:
        articles: List[dict] = parse(url, include_annexes=True)
        if len(articles) > 1:
            return articles
    raise NotFound
# senat multi-page but not mono
assert find_good_url_resp(
    'http://www.senat.fr/rap/l09-654/l09-654.html'
).url == 'http://www.senat.fr/rap/l09-654/l09-6542.html'
# senat multi-page text
assert find_good_url_resp(
    'https://www.senat.fr/rap/l08-584/l08-584.html'
).url == 'https://www.senat.fr/rap/l08-584/l08-584_mono.html'
# senat multi-page examen en commission
assert find_good_url_resp(
    'https://www.senat.fr/rap/l09-535/l09-535.html'
).url == 'https://www.senat.fr/rap/l09-535/l09-5358.html'
print('****** => url fixing OK ******')
print()

print('*** testing parse_texte ****')
assert len(parse_texte.parse('http://www.assemblee-nationale.fr/13/rapports/r2568.asp')) == 5
assert len(parse_texte.parse('https://www.senat.fr/leg/ppl08-039.html')) == 2
print('****** => parse_texte OK ******')

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

download_groupes.process(OUTPUT_DIR)
download_lois_dites.process(OUTPUT_DIR)
opendata_an = download_AN_opendata.process(OUTPUT_DIR)

print()
print('*** testing merge ****')
# complete AN urls
dos, *_ = download_merged_dos('pjl11-497', opendata_an, verbose=False)
assert find_anomalies([dos], verbose=False) == 0
# Simplest variant: no annexes, no error handling, and any non-empty parse wins.
def parse_first_working_url(urls: List[str]) -> List[dict]:
    for url in urls:
        articles: List[dict] = parse(url)
        if articles:
            return articles
    raise NotFound
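# The three parse_first_working_url variants above trade robustness for
# simplicity: the first swallows and logs per-URL failures, the second lets
# any parse error propagate, and this last one accepts any non-empty result
# and skips annexes. A minimal sketch of the exception they all raise,
# assuming it is not already defined by the surrounding project:
#
#     class NotFound(Exception):
#         """Raised when no candidate URL yields a usable parse."""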