예제 #1
0
def find_decrees_doc_urls(offices):
    """Gather decree document links for each office and store them as JSON.

    For every office the work is done in three stages:

    1. Each entry of ``office_info['decree_pages']`` is scanned with
       ``find_links_in_page_with_urllib`` using ``check_decree_link_text``.
       Errors at this stage are not caught and propagate to the caller.
    2. Each link found in stage 1 is scanned again, this time with
       ``check_download_text``; failures are logged to stderr and skipped.
    3. Each link found in stage 2 is passed to ``download_with_cache``;
       failures are logged to stderr and skipped.

    The links from stages 1 and 2 are merged and written, via ``to_json()``,
    to ``office_info['anticor_doc_urls']``. Finally ``offices`` is handed to
    ``write_offices``.
    """

    def report_failure(failed_link, error):
        # Shared failure message for the two best-effort stages.
        sys.stderr.write("cannot download " + failed_link.link_url + ": " +
                         str(error) + "\n")

    for office_info in offices:
        # Stage 1: links found directly on the decree pages.
        decree_links = set()
        for page_url in office_info.get('decree_pages', []):
            sys.stderr.write(page_url + "\n")
            found = find_links_in_page_with_urllib(page_url,
                                                   check_decree_link_text)
            decree_links = decree_links.union(found)

        # Stage 2: follow each decree link looking for download links.
        download_links = set()
        for decree_link in decree_links:
            sys.stderr.write("download " + decree_link.link_url + "\n")
            try:
                found = find_links_in_page_with_urllib(decree_link,
                                                       check_download_text)
                download_links = download_links.union(found)
            except Exception as err:
                report_failure(decree_link, err)

        # Stage 3: fetch every download link; the return value is not used.
        for download_link in download_links:
            sys.stderr.write(
                "download additional " + download_link.link_url + "\n")
            try:
                download_with_cache(download_link.link_url)
            except Exception as err:
                report_failure(download_link, err)

        merged_links = decree_links.union(download_links)
        office_info['anticor_doc_urls'] = [
            found_link.to_json() for found_link in merged_links
        ]
    write_offices(offices)
예제 #2
0
def find_office_decrees_section(offices):
    """Look up the 'office_decrees' link for every office that has a law div URL.

    For each office the URL is read from ``office_info['law_div']['url']``.
    Offices without one (missing or empty) are reported on stderr and left
    untouched. For the others, ``click_first_link_and_get_url`` is invoked
    with the ``'office_decrees'`` key and ``check_office_decree_link_text``.
    After the loop ``offices`` is handed to ``write_offices``.
    """
    for office_info in offices:
        law_div_url = office_info.get('law_div', {}).get('url', '')
        if law_div_url != '':
            sys.stderr.write(law_div_url + "\n")
            click_first_link_and_get_url(
                office_info,
                'office_decrees',
                law_div_url,
                check_office_decree_link_text,
            )
        else:
            sys.stderr.write(
                "skip url " + office_info['url'] + " (no law div info)\n")

    write_offices(offices)
예제 #3
0
def get_decree_pages(offices):
    """Fill ``office_info['decree_pages']`` with the sub-pages of each office.

    The starting link is built from ``office_info['office_decrees']`` when
    that link has a non-empty ``link_url``; otherwise the link built from
    ``office_info['law_div']`` is used. Offices whose law div link has an
    empty ``link_url`` are reported on stderr and skipped, regardless of
    their ``office_decrees`` entry.

    The links returned by ``collect_all_subpages_urls`` are stored as a list
    of their ``to_json()`` values. After the loop ``offices`` is handed to
    ``write_offices``.
    """
    for office_info in offices:
        law_link = TLink(json_dict=office_info.get('law_div', {}))
        if law_link.link_url == '':
            sys.stderr.write(
                "skip url " + office_info['url'] + " (no law div info) \n")
            continue

        decrees_link = TLink(json_dict=office_info.get('office_decrees', {}))
        # Prefer the more specific office-decrees link when it is present.
        start_link = decrees_link if decrees_link.link_url != "" else law_link

        subpage_links = collect_all_subpages_urls(start_link.link_url)
        office_info['decree_pages'] = [
            subpage.to_json() for subpage in subpage_links
        ]

    write_offices(offices)
예제 #4
0
def find_law_div(offices):
    """Look up the 'law_div' link for every office with an anticorruption URL.

    For each office the URL is read from
    ``office_info['anticorruption_div']['url']``:

    * missing or empty URL: the office is reported on stderr and skipped;
    * ``office_info['law_div']['engine'] == 'manual'``: the existing entry is
      kept and the skip is reported on stderr;
    * otherwise ``click_first_link_and_get_url`` is invoked with the
      ``'law_div'`` key and ``check_law_link_text``.

    After the loop ``offices`` is handed to ``write_offices``.
    """
    for office_info in offices:
        anticor_url = office_info.get('anticorruption_div', {}).get('url', '')
        if anticor_url == '':
            sys.stderr.write(
                "skip url " + office_info['url'] + " (no div info) \n")
        elif office_info.get('law_div', {}).get('engine', '') == 'manual':
            sys.stderr.write("skip manual url updating " + anticor_url + "\n")
        else:
            sys.stderr.write(anticor_url + "\n")
            click_first_link_and_get_url(
                office_info,
                'law_div',
                anticor_url,
                check_law_link_text,
            )

    write_offices(offices)
예제 #5
0
def convert_to_text(offices):
    """Convert each office's downloaded documents to text files.

    For every entry of ``office_info['anticor_doc_urls']`` a ``TLink`` is
    built and ``build_temp_local_file(link.link_url)`` supplies the local
    file name; an empty name means the entry is skipped. The text output is
    written next to it as ``<file_name>.txt``. The external DocConvertor
    executable is run only when that text file is missing or empty.

    Paths of all non-empty text files are stored in
    ``office_info['txt_files']``. After the loop ``offices`` is handed to
    ``write_offices``.

    Any exception raised while handling a single document is logged to
    stderr and that document is skipped (best effort).
    """
    # Imported locally because the module's import block is not visible here.
    import subprocess

    converter = ("..\\DocConvertor\\DocConvertor\\DocConvertor"
                 "\\bin\\Debug\\DocConvertor.exe")

    for office_info in offices:
        txtfiles = []
        for d in office_info.get('anticor_doc_urls', []):
            link = TLink(json_dict=d)

            try:
                file_name = build_temp_local_file(link.link_url)
                if file_name == "":
                    continue
                txt_file = file_name + ".txt"
                if not os.path.exists(txt_file) or os.path.getsize(
                        txt_file) == 0:
                    # Log the equivalent command line for diagnostics.
                    sys.stderr.write("{} {} > {}\n".format(
                        converter, file_name, txt_file))
                    # Run the converter from an argument list instead of a
                    # shell string: file names with spaces or shell
                    # metacharacters are passed through verbatim, and the
                    # redirection is done by Python rather than the shell.
                    with open(txt_file, "wb") as out:
                        subprocess.call([converter, file_name], stdout=out)
                if os.path.exists(txt_file) and os.path.getsize(txt_file) > 0:
                    txtfiles.append(txt_file)
            except Exception as err:
                sys.stderr.write(str(err) + "\n")

        office_info['txt_files'] = txtfiles
    write_offices(offices)