Example #1
import os
from datetime import datetime
from shutil import copyfile
from subprocess import call

# pdf_helpers, extract_creation_date, get_script_path, cleanup_file and
# write_to_json are assumed to be project-local helpers (not shown here).


def scrape_pdf(url, filename):
    """Download the PDF at url, strip its first page, parse the tables with
    Tabula and write the extracted guest data to filename as JSON."""
    try:
        print("\ndownloading " + url)
        raw_pdf_name = url.split("/")[-1]
        import_date = datetime.now().replace(microsecond=0)
        pdf_name = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                import_date.month,
                                                import_date.day, raw_pdf_name)
        pdf_helpers.get_pdf_from_admin_ch(url, pdf_name)

        print("\nextracting metadata...")
        creation_date = extract_creation_date(pdf_name)
        archive_pdf_name = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            raw_pdf_name)
        archive_filename = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            filename)
        print("\nPDF creation date: {:02d}.{:02d}.{}\n".format(
            creation_date.day, creation_date.month, creation_date.year))

        print("removing first page of PDF...")
        call([
            "qpdf", "--pages", pdf_name, "2-z", "--", pdf_name,
            "zb_file-stripped.pdf"
        ])

        print("parsing PDF...")
        call([
            "java",
            "-Djava.util.logging.config.file=web_scrapers/logging.properties",
            "-jar",
            get_script_path() + "/tabula-0.9.2-jar-with-dependencies.jar",
            "zb_file-stripped.pdf", "--pages", "all", "-o", "zb_data.csv"
        ])

        print("cleaning up parsed data...")
        guests = cleanup_file("zb_data.csv")

        print("writing " + filename + "...")
        write_to_json(guests, archive_pdf_name, filename, url, creation_date,
                      import_date)

        print("archiving...")
        copyfile(pdf_name,
                 get_script_path() + "/archive/{}".format(archive_pdf_name))
        copyfile(filename,
                 get_script_path() + "/archive/{}".format(archive_filename))

    finally:
        print("cleaning up...")
        os.rename(pdf_name, get_script_path() + "/backup/{}".format(pdf_name))
        backup_filename = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                       import_date.month,
                                                       import_date.day,
                                                       filename)
        copyfile(filename,
                 get_script_path() + "/backup/{}".format(backup_filename))
        os.remove("zb_file-stripped.pdf")
        os.remove("zb_data.csv")
Example #2
import os
from argparse import ArgumentParser
from datetime import datetime
from shutil import copyfile
from subprocess import call

# pdf_helpers, cleanup_file, normalize_namen and write_to_json are assumed to
# be project-local helpers (not shown here).


def scrape():
    """Download (or copy) the parlamentarische-gruppen PDF, parse it with
    Tabula and write the extracted groups to a JSON file."""
    parser = ArgumentParser(description='Scrape Parlamentarische Gruppen PDF')
    parser.add_argument("local_pdf",
                        metavar="file",
                        nargs='?',
                        help="local PDF file to use",
                        default=None)
    args = parser.parse_args()
    local_pdf = args.local_pdf

    url = "https://www.parlament.ch/centers/documents/de/parlamentarische-gruppen.pdf"
    filename = "parlamentarische-gruppen.json"

    script_path = os.path.dirname(os.path.realpath(__file__))
    try:
        import_date = datetime.now().replace(microsecond=0)
        raw_pdf_name = url.split("/")[-1]
        pdf_name = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                import_date.month,
                                                import_date.day, raw_pdf_name)
        if local_pdf is None:
            print("\ndownloading " + url)
            pdf_helpers.get_pdf_from_admin_ch(url, pdf_name)
        else:
            print("\ncopy local PDF " + local_pdf)
            copyfile(local_pdf, pdf_name)

        print("\nextracting metadata...")
        creation_date = pdf_helpers.extract_creation_date(pdf_name)
        archive_pdf_name = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            raw_pdf_name)
        archive_filename = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            filename)
        print("\nPDF creation date: {:02d}.{:02d}.{}\n".format(
            creation_date.day, creation_date.month, creation_date.year))

        print("parsing PDF...")
        FNULL = open(os.devnull, 'w')
        tabula_path = script_path + "/tabula-0.9.2-jar-with-dependencies.jar"
        call([
            "java", "-jar", tabula_path, pdf_name, "--pages", "all", "-o",
            "pg_data.csv"
        ],
             stderr=FNULL)

        print("cleaning up parsed data...")
        groups = cleanup_file("pg_data.csv")
        groups = normalize_namen(groups)

        print("writing " + filename + "...")
        write_to_json(groups, archive_pdf_name, filename, url, creation_date,
                      import_date)

        if local_pdf is None:
            print("archiving...")
            copyfile(pdf_name,
                     script_path + "/archive/{}".format(archive_pdf_name))
            copyfile(filename,
                     script_path + "/archive/{}".format(archive_filename))

    finally:
        print("cleaning up...")
        os.rename(pdf_name, script_path + "/backup/{}".format(pdf_name))
        backup_filename = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                       import_date.month,
                                                       import_date.day,
                                                       filename)
        copyfile(filename, script_path + "/backup/{}".format(backup_filename))
        os.remove("pg_data.csv")
Example #3
import os
from datetime import datetime
from shutil import copyfile
from subprocess import call

# pdf_helpers, get_script_path, read_guests and write_to_json are assumed to
# be project-local helpers (not shown here).


def scrape_pdf(url, local_pdf, filename):
    """Download (or copy) the PDF at url, strip its first page, parse it with
    Tabula and write the extracted guest data to filename as JSON."""
    script_path = get_script_path()
    stripped_file_name = None
    try:
        raw_pdf_name = url.split("/")[-1]
        import_date = datetime.now().replace(microsecond=0)
        pdf_name = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                import_date.month,
                                                import_date.day, raw_pdf_name)
        if local_pdf is None:
            print("\ndownloading " + url)
            pdf_helpers.get_pdf_from_admin_ch(url, pdf_name)
        else:
            print("\ncopy local PDF " + local_pdf)
            copyfile(local_pdf, pdf_name)

        print("\nextracting metadata...")
        creation_date = pdf_helpers.extract_creation_date(pdf_name)
        archive_pdf_name = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            raw_pdf_name)
        archive_filename = "{}-{:02d}-{:02d}-{}".format(
            creation_date.year, creation_date.month, creation_date.day,
            filename)
        print("\nPDF creation date: {:02d}.{:02d}.{}\n".format(
            creation_date.day, creation_date.month, creation_date.year))

        print("removing first page of PDF...")
        stripped_file_name = "zb_file-stripped.pdf"
        call([
            "qpdf", "--pages", pdf_name, "2-z", "--", pdf_name,
            stripped_file_name
        ])

        print("parsing PDF...")
        tabula_path = script_path + "/tabula-1.0.4-jar-with-dependencies.jar"
        cmd = [
            "java",
            "-Djava.util.logging.config.file=web_scrapers/logging.properties",
            "-jar", tabula_path, stripped_file_name, "-o", "zb_data.csv",
            "--pages", "all", "-l", "-i"
        ]
        print(" ".join(cmd))
        call(cmd, stderr=None)

        print("cleaning up parsed data...")
        guests = read_guests("zb_data.csv")

        print("writing " + filename + "...")
        write_to_json(guests, archive_pdf_name, filename, url, creation_date,
                      import_date)

        print("archiving...")
        copyfile(pdf_name,
                 script_path + "/archive/{}".format(archive_pdf_name))
        copyfile(filename,
                 script_path + "/archive/{}".format(archive_filename))

    finally:
        print("cleaning up...")
        os.rename(pdf_name, script_path + "/backup/{}".format(pdf_name))
        backup_filename = "{}-{:02d}-{:02d}-{}".format(import_date.year,
                                                       import_date.month,
                                                       import_date.day,
                                                       filename)
        copyfile(filename, script_path + "/backup/{}".format(backup_filename))
        if stripped_file_name and os.path.isfile(stripped_file_name):
            os.remove(stripped_file_name)
        if os.path.isfile("zb_data.csv"): os.remove("zb_data.csv")