import os
import sys


def find_decrees_doc_urls(offices):
    """Collect decree document links from each office's decree pages and download them."""
    for office_info in offices:
        docs = set()
        for url in office_info.get('decree_pages', []):
            sys.stderr.write(url + "\n")
            new_docs = find_links_in_page_with_urllib(url, check_decree_link_text)
            docs = docs.union(new_docs)

        # follow every found link once more to pick up nested download links
        additional_docs = set()
        for link in docs:
            sys.stderr.write("download " + link.link_url + "\n")
            try:
                new_docs = find_links_in_page_with_urllib(link, check_download_text)
                additional_docs = additional_docs.union(new_docs)
            except Exception as err:
                sys.stderr.write("cannot download " + link.link_url + ": " + str(err) + "\n")

        for link in additional_docs:
            sys.stderr.write("download additional " + link.link_url + "\n")
            try:
                download_with_cache(link.link_url)
            except Exception as err:
                sys.stderr.write("cannot download " + link.link_url + ": " + str(err) + "\n")

        docs = docs.union(additional_docs)
        office_info['anticor_doc_urls'] = [x.to_json() for x in docs]
        write_offices(offices)  # checkpoint after each office

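# The check_* predicates passed to find_links_in_page_with_urllib are defined
# elsewhere in the project. Judging by the call sites, each one takes a link
# and returns True when its anchor text looks relevant. The sketch below is a
# hypothetical illustration of that contract only; the function name, the
# link_text attribute, and the keyword list are all assumptions.
def check_download_text_example(link):
    keywords = ("скачать", "download", ".doc", ".pdf")  # assumed keywords
    return any(k in link.link_text.lower() for k in keywords)
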
def find_office_decrees_section(offices):
    """Locate the decrees section linked from each office's law division page."""
    for office_info in offices:
        url = office_info.get('law_div', {}).get('url', '')
        if url == '':
            sys.stderr.write("skip url " + office_info['url'] + " (no law div info)\n")
            continue
        sys.stderr.write(url + "\n")
        click_first_link_and_get_url(office_info, 'office_decrees', url,
                                     check_office_decree_link_text)
        write_offices(offices)  # checkpoint after each office

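# click_first_link_and_get_url is also defined elsewhere. From its call sites
# it appears to scan the page at `url` for the first link whose text passes
# the given predicate and to store that link under office_info[key]. A
# hypothetical sketch of that contract (not the real implementation; the real
# helper presumably preserves the page order of links, which a set does not):
def click_first_link_and_get_url_sketch(office_info, key, url, check_link_text):
    for link in find_links_in_page_with_urllib(url, check_link_text):
        office_info[key] = link.to_json()
        return
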
def get_decree_pages(offices):
    """Crawl all subpages under the decrees section (or the law division as a fallback)."""
    for office_info in offices:
        law_div = office_info.get('law_div', {})
        main_link = TLink(json_dict=law_div)
        if main_link.link_url == '':
            sys.stderr.write("skip url " + office_info['url'] + " (no law div info)\n")
            continue
        # prefer the dedicated decrees section if one was found
        office_link = TLink(json_dict=office_info.get('office_decrees', {}))
        if office_link.link_url != "":
            main_link = office_link
        all_links = collect_all_subpages_urls(main_link.link_url)
        office_info['decree_pages'] = [l.to_json() for l in all_links]
        write_offices(offices)

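# TLink's interface, as used in this file: it can be built from a JSON dict,
# exposes link_url (empty when the dict is empty), and serializes back with
# to_json(). A minimal sketch under those assumptions; the key names inside
# the dict are guesses:
class TLinkSketch(object):
    def __init__(self, json_dict=None):
        json_dict = json_dict or {}
        self.link_url = json_dict.get('url', '')    # assumed key name
        self.link_text = json_dict.get('text', '')  # assumed key name

    def to_json(self):
        return {'url': self.link_url, 'text': self.link_text}
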
def find_law_div(offices):
    """Find the law division page linked from each office's anticorruption division."""
    for office_info in offices:
        url = office_info.get('anticorruption_div', {}).get('url', '')
        if url == '':
            sys.stderr.write("skip url " + office_info['url'] + " (no div info)\n")
            continue
        # entries marked 'manual' were set by hand and must not be overwritten
        if office_info.get('law_div', {}).get('engine', '') == 'manual':
            sys.stderr.write("skip manual url updating " + url + "\n")
            continue
        sys.stderr.write(url + "\n")
        click_first_link_and_get_url(office_info, 'law_div', url, check_law_link_text)
        write_offices(offices)

def convert_to_text(offices):
    """Convert each downloaded decree document to plain text with DocConvertor."""
    for office_info in offices:
        txtfiles = []
        for d in office_info.get('anticor_doc_urls', []):
            link = TLink(json_dict=d)
            try:
                file_name = build_temp_local_file(link.link_url)
                if file_name == "":
                    continue
                txt_file = file_name + ".txt"
                # skip documents that were already converted successfully
                if not os.path.exists(txt_file) or os.path.getsize(txt_file) == 0:
                    cmd = "..\\DocConvertor\\DocConvertor\\DocConvertor\\bin\\Debug\\DocConvertor.exe {} > {}".format(
                        file_name, txt_file)
                    sys.stderr.write(cmd + "\n")
                    os.system(cmd)
                if os.path.exists(txt_file) and os.path.getsize(txt_file) > 0:
                    txtfiles.append(txt_file)
            except Exception as err:
                sys.stderr.write(str(err) + "\n")
        office_info['txt_files'] = txtfiles
        write_offices(offices)

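# One plausible way to chain these passes end to end. read_offices is a
# hypothetical counterpart to write_offices; the ordering is inferred from the
# keys each step reads and writes.
if __name__ == "__main__":
    offices = read_offices()              # hypothetical loader
    find_law_div(offices)                 # anticorruption_div -> law_div
    find_office_decrees_section(offices)  # law_div -> office_decrees
    get_decree_pages(offices)             # law_div/office_decrees -> decree_pages
    find_decrees_doc_urls(offices)        # decree_pages -> anticor_doc_urls
    convert_to_text(offices)              # anticor_doc_urls -> txt_files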