def find_amendements(path):
    """Yield (amendement, source_file) pairs for every amendement stored
    under *path*, scanning all ``amendements_*`` JSON files recursively."""
    pattern = os.path.join(path, '**/amendements_*')
    for json_path in glob.glob(pattern, recursive=True):
        data = open_json(json_path)
        # each file groups its amendements by subject
        for sujet in data.get('sujets', {}).values():
            for amendement in sujet.get('amendements', []):
                yield amendement, json_path
def find_parsed_doslegs(api_directory):
    """Return parsed doslegs found under *api_directory*, indexed by Senat id.

    Dossiers without a 'senat_id' field are skipped; the number of matches
    is printed for progress feedback.
    """
    pattern = os.path.join(api_directory, '**/procedure.json')
    parsed = {}
    for procedure_path in glob.glob(pattern, recursive=True):
        dossier = open_json(procedure_path)
        senat_id = dossier.get('senat_id')
        if senat_id:
            parsed[senat_id] = dossier
    print(len(parsed), 'parsed found')
    return parsed
def read_text(path):
    """Render the articles of a serialized text as one markdown-like string.

    path -- path to the articles JSON file (split into dirname/basename for
            open_json)

    Returns "" when the file does not exist.  Each article becomes a
    "# Article <titre>" heading followed by its non-empty alineas, one per
    line, in sorted alinea-key order.
    """
    # TODO: format tables
    try:
        articles = open_json(os.path.dirname(path), os.path.basename(path))["articles"]
    except FileNotFoundError:
        return ""
    # Build the parts in a list and join once: the original repeated
    # `texte += ...` concatenation is quadratic on large texts.
    parts = []
    for art in articles:
        parts.append("# Article " + art["titre"] + "\n\n")
        alineas = art["alineas"]
        # NOTE(review): keys are sorted lexically — presumably they are
        # zero-padded alinea indices; confirm against the serializer.
        for key in sorted(alineas.keys()):
            if alineas[key] != "":
                parts.append(alineas[key] + "\n")
        parts.append("\n")
    return "".join(parts)
def process(output_dir, dos):
    """Compute aggregate statistics for one parsed dosleg (bill dossier).

    output_dir -- directory holding the dossier's generated files; expects
                  viz/interventions.json and the amendements_* files
    dos        -- parsed procedure dict (must have 'steps' and 'beginning')

    Returns a flat dict of counters and ratios.  Raises ZeroDivisionError
    when the initial text has no articles, zero length or zero words.
    """
    stats = {}

    # # # INTERVENTIONS # # #
    intervs = open_json(os.path.join(output_dir, 'viz/interventions.json'))
    # only keep seances in hemicycle
    intervs = {step_name: step for step_name, step in intervs.items() if '_hemicycle' in step_name}
    stats['total_mots'] = sum([ sum(i['total_mots'] for i in step['divisions'].values()) for step in intervs.values() ])
    # distinct speakers across all kept steps
    stats["total_intervenants"] = len({orat for step in intervs.values() for orat in step['orateurs'].keys()})
    # NOTE(review): this is a SET comprehension, so divisions with equal
    # 'total_intervs' counts collapse to one before summing — confirm intended.
    stats["total_interventions"] = sum({division['total_intervs'] for step in intervs.values() for division in step['divisions'].values()})
    stats["total_seances"] = sum([step['total_seances'] for step in intervs.values()])
    # NOTE: loop variable 'dir' shadows the builtin of the same name.
    stats["total_seances_assemblee"] = sum([step['total_seances'] for dir, step in intervs.items() if '_assemblee' in dir])
    stats["total_seances_senat"] = sum([step['total_seances'] for dir, step in intervs.items() if '_senat' in dir])

    # # # AMENDMENTS # # #
    add_amendments_stats(stats, find_amendements(output_dir))

    # # # TEXTS # # #
    first_step, last_step = find_first_and_last_steps(dos)
    first_arts = read_articles(first_step)
    last_arts = read_articles(last_step)
    stats["total_input_articles"] = len(first_arts)
    stats["total_output_articles"] = len(last_arts)
    # relative growth between initial and final text, by article count,
    # character length and word count
    stats["ratio_articles_growth"] = (stats["total_output_articles"] - stats["total_input_articles"]) / stats["total_input_articles"]
    stats["input_text_length"] = step_text_length(first_step)
    stats["output_text_length"] = step_text_length(last_step)
    stats["ratio_text_length_growth"] = (stats["output_text_length"] - stats["input_text_length"]) / stats["input_text_length"]
    stats["input_text_word_count"] = step_word_count(first_step)
    stats["output_text_word_count"] = step_word_count(last_step)
    stats["ratio_word_count_growth"] = (stats["output_text_word_count"] - stats["input_text_word_count"]) / stats["input_text_word_count"]
    # last adopted text before the Conseil constitutionnel ruling, if any
    adopted_step = find_first_and_last_steps(dos, include_CC=False)[1]
    if has_been_censored(dos):
        stats["censored_articles"], stats["fully_censored_articles"] = count_censored_articles(last_step)
        stats["output_text_before_CC_length"] = step_text_length(adopted_step)
        stats["output_text_before_CC_word_count"] = step_word_count(adopted_step)
    stats["ratio_texte_modif"] = 1 - compute_similarity_by_articles(first_arts, last_arts)

    # # # PROCEDURE # # #
    stats["echecs_procedure"] = len([step for step in dos['steps'] if step.get("echec")])
    # TODO: first institution
    stats['last_stage'] = adopted_step.get('stage')
    if stats['last_stage'] == 'CMP':
        stats['last_institution'] = 'CMP'
    else:
        stats['last_institution'] = adopted_step.get('institution')
    # fall back to the date of the LAST dated step when 'end' is missing
    # (the loop deliberately keeps overwriting maxdate without breaking)
    maxdate = dos.get('end')
    if not maxdate:
        for step in dos['steps']:
            if step.get('date'):
                maxdate = step.get('enddate') or step.get('date')
    stats["total_days"] = (datize(maxdate) - datize(dos['beginning'])).days + 1
    stats["attached_law_proposals"] = count_initial_depots(dos['steps']) - 1
    stats["depots_in_institutions"] = count_navettes(dos['steps'])
    stats["texts_produced"] = count_texts(dos['steps'])
    return stats
# NOTE(review): orphaned duplicate of the tail of the process() function above
# (from the "ratio_texte_modif" computation down to "return stats") with the
# script's __main__ guard appended.  These statements reference names (stats,
# adopted_step, first_arts, dos) that only exist inside process(); this looks
# like an extraction/merge artifact.  Left verbatim — note only.
stats["ratio_texte_modif"] = 1 - compute_similarity_by_articles(first_arts, last_arts) # # # PROCEDURE # # # stats["echecs_procedure"] = len([step for step in dos['steps'] if step.get("echec")]) # TODO: first institution stats['last_stage'] = adopted_step.get('stage') if stats['last_stage'] == 'CMP': stats['last_institution'] = 'CMP' else: stats['last_institution'] = adopted_step.get('institution') maxdate = dos.get('end') if not maxdate: for step in dos['steps']: if step.get('date'): maxdate = step.get('enddate') or step.get('date') stats["total_days"] = (datize(maxdate) - datize(dos['beginning'])).days + 1 stats["attached_law_proposals"] = count_initial_depots(dos['steps']) - 1 stats["depots_in_institutions"] = count_navettes(dos['steps']) stats["texts_produced"] = count_texts(dos['steps']) return stats if __name__ == '__main__': print_json(process(sys.argv[1], open_json(sys.argv[2])))
# NOTE(review): tail of an article-steps post-processing function whose "def"
# lies outside this chunk (it builds and returns `out`).  It (1) drops each
# step's 'text' and '_original_index' and de-duplicates consecutive steps
# sharing the same id_step, then (2) recomputes every article's 'order' from
# titles sorted with compare_articles, giving the 'echec' pseudo-article
# order -1.  Indentation of the enclosing scope is unknown, so the fragment
# is left verbatim — note only.
for a in sorted(out['articles']): new_steps = [] for s in out['articles'][a]['steps']: del s['text'] s.pop('_original_index', None) if len(new_steps) > 0 and new_steps[-1]['id_step'] == s['id_step']: print('same id_step', s['id_step'], file=sys.stderr) continue new_steps.append(s) out['articles'][a]['steps'] = new_steps # Set articles' order values after having reinserted missing ones orders = {k: n for n, k in enumerate( sorted( [a['titre'] for a in out['articles'].values() if a['id'] != 'echec'], key=cmp_to_key(compare_articles) )) } for a in out['articles'].values(): if a['id'] == 'echec': a['order'] = -1 else: a['order'] = orders[a['titre']] return out if __name__ == '__main__': print_json(process(open_json(sys.argv[1])))
a["statut"] = "conforme" a["order"] = order order += 1 write_json(a) # do not keep already deleted articles but mark as deleted missing ones elif not re_suppr.match(a["statut"]) or texte.get('echec', ''): # if the last line of text was some dots, it means that we should keep # the articles as-is if they are not deleted if line['type'] == 'dots': # ex: https://www.senat.fr/leg/ppl09-304.html log("DEBUG: Recovering art as non-modifié via dots %s (leftovers)" % cur) a["statut"] = "non modifié" a["order"] = order order += 1 write_json(a) else: log("DEBUG: Marking art %s as supprimé (leftovers)" % cur) a["statut"] = "supprimé" a["alineas"] = dict() a["order"] = order order += 1 write_json(a) return ALL_ARTICLES if __name__ == '__main__': serialized = open_json(sys.argv[1]) serialized["debug"] = True result = complete(**serialized) print_json(result)
# NOTE(review): fragment of a git/gitlab publishing script.  The names gitlab,
# Path, shutil, GIT_REPOS_DIRECTORY and open_json are imported/defined outside
# this chunk, and the body of the final loop continues past it.
GITLAB_TOKEN = sys.argv[2] if len(sys.argv) == 3 else None
if GITLAB_TOKEN:
    gl = gitlab.Gitlab('https://git.regardscitoyens.org/', private_token=GITLAB_TOKEN)
    group = gl.groups.list(search='parlement')[0]
    # delete existing bills
    projects = group.projects.list()
    for project in projects:
        print('delete', project.id)
        gl.projects.delete(project.id)

# Publish one git repository per sizeable dossier
# (at least 5 steps and 5 amendements).
for procedure_file in sorted(
        glob.glob("data/**/procedure.json", recursive=True)):
    procedure = open_json(procedure_file)
    if len(procedure["steps"]) < 5:
        continue
    if procedure["stats"]["total_amendements"] < 5:
        continue
    project_dir = os.path.dirname(os.path.dirname(procedure_file))
    # recreate a clean clone directory for this bill
    git_dir = Path(GIT_REPOS_DIRECTORY) / procedure["id"]
    shutil.rmtree(str(git_dir), ignore_errors=True)
    os.makedirs(str(git_dir))
    remote_url = "[email protected]:/parlement/{bill}.git".format(
        bill=procedure["id"])
# NOTE(review): top of the steps_as_dot script; the chunk is truncated — the
# body of the final "for step_i, step ..." loop is not visible here.
import sys, os, random, glob

from tlfp.tools.common import open_json, print_json

# require at least the data directory argument
if len(sys.argv) < 2:
    print(
        'USAGE: "python steps_as_dot.py <data_directory> | dot -Tpng > steps.png"'
    )
    sys.exit()

# a second extra argument switches to the detailed rendering mode
mode = "detailed" if len(sys.argv) == 3 else "simple"

# reference list of valid procedure steps shipped with the project docs
# NOTE(review): the path climbs two levels via dirname() and one more via
# '..' — confirm this actually resolves to the repository's docs/ directory.
procedure_file = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '..', 'docs',
    'valid_procedure.json')
procedure = open_json(procedure_file)

API_DIRECTORY = sys.argv[1]
# load every parsed dossier and keep only finished ones (having an 'end' date)
all_senat_jo = [open_json(path) for path \
    in glob.glob(os.path.join(API_DIRECTORY, '*/viz/procedure.json'))]
all_senat_jo = [dos for dos in all_senat_jo if dos.get('end')]
# all_senat_jo = [x for x in open_json(sys.argv[1]) if len(x['steps']) > 2]
# all_senat_jo = random.sample(all_senat_jo, 5)

# accumulators for the DOT graph: node sizes, step transitions, debug log
nodes_names_size = {}
step_trans = {}
steps_logs = ""
for dos in all_senat_jo:
    prev_step = None
    last_step = ''
    for step_i, step in enumerate(dos.get('steps', [])):
# NOTE(review): duplicate of the tail of the complete() function
# (leftover-articles handling) plus the script's __main__ guard — same content
# as the nearby chunk starting with the "conforme" assignment, minus its first
# statements.  The matching "if" of the "elif" is cut off before this chunk.
# Left verbatim — note only.
order += 1 write_json(a) # do not keep already deleted articles but mark as deleted missing ones elif not re_suppr.match(a["statut"]) or texte.get('echec', ''): # if the last line of text was some dots, it means that we should keep # the articles as-is if they are not deleted if line['type'] == 'dots': # ex: https://www.senat.fr/leg/ppl09-304.html log("DEBUG: Recovering art as non-modifié via dots %s (leftovers)" % cur) a["statut"] = "non modifié" a["order"] = order order += 1 write_json(a) else: log("DEBUG: Marking art %s as supprimé (leftovers)" % cur) a["statut"] = "supprimé" a["alineas"] = dict() a["order"] = order order += 1 write_json(a) return ALL_ARTICLES if __name__ == '__main__': serialized = open_json(sys.argv[1]) serialized["debug"] = True result = complete(**serialized) print_json(result)
""" Usage: python generate_dossiers_csv.py <api_directory> Output in <api_directory>: - dossiers_promulgues.csv with all the doslegs ready - home.json for the homepage informations """ import glob, os, sys, csv, re, copy, datetime from tlfp.tools.common import upper_first, open_json, print_json API_DIRECTORY = sys.argv[1] re_dos_ok = re.compile(r"%s/[^.]+/" % API_DIRECTORY.strip('/')) dossiers = [(open_json(path), path) for path \ in glob.glob(os.path.join(API_DIRECTORY, '*/viz/procedure.json')) if re_dos_ok.search(path)] dossiers = [(dos, path) for dos, path in dossiers if "_tmp" not in path] csvfile = csv.writer(open(os.path.join(API_DIRECTORY, 'dossiers.csv'), 'w'), delimiter=';') csvfile.writerow(('id;Titre;Type de dossier;Date initiale;URL du dossier;État du dossier;Décision du CC;' 'Date de la décision;Date de promulgation;Numéro de la loi;Thèmes;total_amendements;total_mots;' 'short_title;loi_dite;assemblee_id').split(';')) def format_date_for_human(date): if not date: return '' return '/'.join(reversed(date.split('-'))) def in_room(step):
"count": len(tosave), "page": npage, "next_page": None, "dossiers": tosave } if done < total: data["next_page"] = namefile(npage + 1) print('[assemble_procedure] >', namefile(npage)) print_json(data, os.path.join(sourcedir, namefile(npage))) done = 0 tosave = [] for d in dossiers: proc = open_json(os.path.join(sourcedir, d['id'], 'viz'), 'procedure.json') proc["id"] = d["id"] for f in ["table_concordance", "objet_du_texte"]: if f in proc: proc.pop(f) tosave.append(proc) done += 1 if done % pagesize == 0: save_json_page(tosave, done) tosave = [] if tosave: save_json_page(tosave, done)
# quick script to produce a DOT file of the steps from a list of dosleg # use "python steps_as_dot.py <data_directory> | dot -Tpng > steps.png" to produce the diagram # the XKCD font is available here: https://github.com/ipython/xkcd-font/tree/master/xkcd/build import sys, os, random, glob from tlfp.tools.common import open_json, print_json if len(sys.argv) < 2: print('USAGE: "python steps_as_dot.py <data_directory> | dot -Tpng > steps.png"') sys.exit() mode = "detailed" if len(sys.argv) == 3 else "simple" procedure_file = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '..', 'docs', 'valid_procedure.json') procedure = open_json(procedure_file) API_DIRECTORY = sys.argv[1] all_senat_jo = [open_json(path) for path \ in glob.glob(os.path.join(API_DIRECTORY, '*/viz/procedure.json'))] all_senat_jo = [dos for dos in all_senat_jo if dos.get('end')] # all_senat_jo = [x for x in open_json(sys.argv[1]) if len(x['steps']) > 2] # all_senat_jo = random.sample(all_senat_jo, 5) nodes_names_size = {} step_trans = {} steps_logs = "" for dos in all_senat_jo: prev_step = None last_step = '' for step_i, step in enumerate(dos.get('steps', [])):
def process(output_dir, dos):
    """Compute aggregate statistics for one parsed dosleg.

    This is a re-formatted duplicate of the process() function appearing
    earlier in this file; see the review notes inline.

    output_dir -- directory with the dossier's generated files
                  (viz/interventions.json, amendements_* files)
    dos        -- parsed procedure dict ('steps', 'beginning', ...)

    Returns a flat dict of counters and ratios; ZeroDivisionError is possible
    when the initial text is empty.
    """
    stats = {}

    # # # INTERVENTIONS # # #
    intervs = open_json(os.path.join(output_dir, 'viz/interventions.json'))
    # only keep seances in hemicycle
    intervs = {step_name: step for step_name, step in intervs.items() if '_hemicycle' in step_name}
    stats['total_mots'] = sum([ sum(i['total_mots'] for i in step['divisions'].values()) for step in intervs.values() ])
    # distinct speakers across all kept steps
    stats["total_intervenants"] = len({ orat for step in intervs.values() for orat in step['orateurs'].keys() })
    # NOTE(review): set comprehension inside sum() — equal 'total_intervs'
    # counts are deduplicated before summing; confirm this is intended.
    stats["total_interventions"] = sum({ division['total_intervs'] for step in intervs.values() for division in step['divisions'].values() })
    stats["total_seances"] = sum( [step['total_seances'] for step in intervs.values()])
    # NOTE: loop variable 'dir' shadows the builtin of the same name
    stats["total_seances_assemblee"] = sum([ step['total_seances'] for dir, step in intervs.items() if '_assemblee' in dir ])
    stats["total_seances_senat"] = sum([ step['total_seances'] for dir, step in intervs.items() if '_senat' in dir ])

    # # # AMENDMENTS # # #
    add_amendments_stats(stats, find_amendements(output_dir))

    # # # TEXTS # # #
    first_step, last_step = find_first_and_last_steps(dos)
    first_arts = read_articles(first_step)
    last_arts = read_articles(last_step)
    stats["total_input_articles"] = len(first_arts)
    stats["total_output_articles"] = len(last_arts)
    # relative growth between initial and final text
    stats["ratio_articles_growth"] = ( stats["total_output_articles"] - stats["total_input_articles"]) / stats["total_input_articles"]
    stats["input_text_length"] = step_text_length(first_step)
    stats["output_text_length"] = step_text_length(last_step)
    stats["ratio_text_length_growth"] = ( stats["output_text_length"] - stats["input_text_length"]) / stats["input_text_length"]
    stats["input_text_word_count"] = step_word_count(first_step)
    stats["output_text_word_count"] = step_word_count(last_step)
    stats["ratio_word_count_growth"] = ( stats["output_text_word_count"] - stats["input_text_word_count"]) / stats["input_text_word_count"]
    # last adopted text before the Conseil constitutionnel ruling, if any
    adopted_step = find_first_and_last_steps(dos, include_CC=False)[1]
    if has_been_censored(dos):
        stats["censored_articles"], stats[ "fully_censored_articles"] = count_censored_articles(last_step)
        stats["output_text_before_CC_length"] = step_text_length(adopted_step)
        stats["output_text_before_CC_word_count"] = step_word_count( adopted_step)
    stats["ratio_texte_modif"] = 1 - compute_similarity_by_articles( first_arts, last_arts)

    # # # PROCEDURE # # #
    stats["echecs_procedure"] = len( [step for step in dos['steps'] if step.get("echec")])
    # TODO: first institution
    stats['last_stage'] = adopted_step.get('stage')
    if stats['last_stage'] == 'CMP':
        stats['last_institution'] = 'CMP'
    else:
        stats['last_institution'] = adopted_step.get('institution')
    # fall back to the LAST dated step when 'end' is missing (no break:
    # maxdate keeps being overwritten on purpose)
    maxdate = dos.get('end')
    if not maxdate:
        for step in dos['steps']:
            if step.get('date'):
                maxdate = step.get('enddate') or step.get('date')
    stats["total_days"] = (datize(maxdate) - datize(dos['beginning'])).days + 1
    stats["attached_law_proposals"] = count_initial_depots(dos['steps']) - 1
    stats["depots_in_institutions"] = count_navettes(dos['steps'])
    stats["texts_produced"] = count_texts(dos['steps'])
    return stats
# NOTE(review): orphaned continuation/duplicate of the tail of the process()
# function directly above — it even begins mid-expression with the closing
# arguments of compute_similarity_by_articles(...) — plus the script's
# __main__ guard.  Extraction artifact; left verbatim — note only.
first_arts, last_arts) # # # PROCEDURE # # # stats["echecs_procedure"] = len( [step for step in dos['steps'] if step.get("echec")]) # TODO: first institution stats['last_stage'] = adopted_step.get('stage') if stats['last_stage'] == 'CMP': stats['last_institution'] = 'CMP' else: stats['last_institution'] = adopted_step.get('institution') maxdate = dos.get('end') if not maxdate: for step in dos['steps']: if step.get('date'): maxdate = step.get('enddate') or step.get('date') stats["total_days"] = (datize(maxdate) - datize(dos['beginning'])).days + 1 stats["attached_law_proposals"] = count_initial_depots(dos['steps']) - 1 stats["depots_in_institutions"] = count_navettes(dos['steps']) stats["texts_produced"] = count_texts(dos['steps']) return stats if __name__ == '__main__': print_json(process(sys.argv[1], open_json(sys.argv[2])))
# NOTE(review): duplicate (re-formatted) of the article-steps post-processing
# tail seen earlier in this file: strips 'text'/'_original_index' from steps,
# drops consecutive steps with the same id_step, then reassigns article
# 'order' from titles sorted with compare_articles ('echec' gets -1).  Its
# enclosing "def" and the "for a in ..." header are outside this chunk, so it
# is left verbatim — note only.
for s in out['articles'][a]['steps']: del s['text'] s.pop('_original_index', None) if len(new_steps) > 0 and new_steps[-1]['id_step'] == s['id_step']: print('same id_step', s['id_step'], file=sys.stderr) continue new_steps.append(s) out['articles'][a]['steps'] = new_steps # Set articles' order values after having reinserted missing ones orders = { k: n for n, k in enumerate( sorted([ a['titre'] for a in out['articles'].values() if a['id'] != 'echec' ], key=cmp_to_key(compare_articles))) } for a in out['articles'].values(): if a['id'] == 'echec': a['order'] = -1 else: a['order'] = orders[a['titre']] return out if __name__ == '__main__': print_json(process(open_json(sys.argv[1])))
# NOTE(review): duplicate of the assemble_procedure pagination fragment seen
# earlier in this file, here including the page-number computation.  The
# opening statements belong to a save_json_page() helper whose "def" lies
# outside the chunk; dossiers, sourcedir, pagesize, total and namefile are
# defined elsewhere.  Left verbatim — note only.
npage = (done - 1) // pagesize data = {"total": total, "count": len(tosave), "page": npage, "next_page": None, "dossiers": tosave} if done < total: data["next_page"] = namefile(npage+1) print('[assemble_procedure] >', namefile(npage)) print_json(data, os.path.join(sourcedir, namefile(npage))) done = 0 tosave = [] for d in dossiers: proc = open_json(os.path.join(sourcedir, d['id'], 'viz'), 'procedure.json') proc["id"] = d["id"] for f in ["table_concordance", "objet_du_texte"]: if f in proc: proc.pop(f) tosave.append(proc) done += 1 if done % pagesize == 0: save_json_page(tosave, done) tosave = [] if tosave: save_json_page(tosave, done)