示例#1
0
def parse_protocol(row):
    """Make sure the parsed output file for ``row`` exists and describe it.

    File names are resolved with ``get_filenames(row)``. Three cases:

    * the output file already exists: it is reused as-is;
    * only the downloaded source file exists: it is parsed with
      ``CommitteeMeetingProtocol`` and written as plain text when the
      module-level ``parse_type`` is ``"text"``, otherwise as a
      ``header,body`` CSV;
    * neither exists: a warning is logged.

    The module-level ``stats`` counters ``"existing files"`` and
    ``"parsed files"`` are incremented accordingly.

    Returns:
        ``(ext, output_filename, filesize, crc32c)`` for the output file, or
        ``(None, None, 0, None)`` when the download file is missing.
    """
    # The first element (the original file name) is not needed here.
    _, ext, output_filename, full_output_filename, download_filename = get_filenames(
        row)
    if os.path.exists(full_output_filename):
        logging.info('file exists: {}'.format(full_output_filename))
        stats["existing files"] += 1
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
        logging.info('existing file: {}'.format(full_output_filename))
    elif os.path.exists(download_filename):
        with open(download_filename, "rb") as f:
            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                os.makedirs(os.path.dirname(full_output_filename),
                            exist_ok=True)
                # Write to a temporary file first and copy it over only once
                # the whole protocol was written.
                with utils.temp_file() as temp_filename:
                    if parse_type == "text":
                        with open(temp_filename, "w") as of:
                            of.write(protocol.text)
                    else:
                        # The csv module requires newline="" so that row
                        # terminators and newlines embedded in fields are
                        # written untranslated.
                        with open(temp_filename, "w", newline="") as of:
                            csv_writer = csv.writer(of)
                            csv_writer.writerow(["header", "body"])
                            for part in protocol.parts:
                                csv_writer.writerow([part.header, part.body])
                    shutil.copy(temp_filename, full_output_filename)
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
        logging.info('parsed file: {}'.format(full_output_filename))
        stats["parsed files"] += 1
    else:
        logging.warning('missing document committee session file: {}'.format(
            download_filename))
        ext, output_filename, filesize, crc32c = None, None, 0, None
    return ext, output_filename, filesize, crc32c
def get_resource():
    """Yield every row of ``download_rows`` annotated with its parsed file.

    For each row the source path and the output path are derived from the
    row's ids. Unless the module-level ``files_limit`` was reached
    (compared against ``stats["parsed files"]``), the protocol is loaded
    either from the local download directory (``download_from_path``) or
    from ``download_from_remote_storage`` and handed to ``parse_protocol``.

    Each yielded row is updated with ``protocol_extension`` and
    ``parsed_filename``. A row whose processing raises is logged, gets an
    ``error`` key, is appended to the module-level ``errors`` list and is
    not yielded.
    """
    for position, row in enumerate(download_rows):
        logging.info("{} / {}".format(position, len(download_rows)))
        try:
            group_dir = str(row["GroupTypeID"])
            document_id = str(row["DocumentCommitteeSessionID"])
            # Source files are sharded by the first two characters of the id.
            original_filename = os.path.join(
                "files", group_dir, document_id[0], document_id[1],
                document_id + "." + row["ApplicationDesc"])
            _, suffix = os.path.splitext(original_filename)
            ext = suffix.lower()

            session_id = str(row["CommitteeSessionID"])
            output_suffix = "csv" if parse_type == "parts" else "txt"
            output_filename = "files/{}/{}/{}.{}".format(
                session_id[0], session_id[1], session_id, output_suffix)

            within_limit = not files_limit or stats["parsed files"] < files_limit
            if within_limit:
                if not (download_from_path or download_from_remote_storage):
                    raise Exception("no valid download option")
                if download_from_path:
                    download_filename = "../data/committees/download_document_committee_session/" + original_filename
                    if not os.path.exists(download_filename):
                        logging.warning("missing download_filename {}".format(download_filename))
                    else:
                        with open(download_filename, "rb") as source_file:
                            with CommitteeMeetingProtocol.get_from_file(source_file) as protocol:
                                parse_protocol(output_filename, protocol)
                else:
                    url = download_from_remote_storage + original_filename
                    with CommitteeMeetingProtocol.get_from_url(url) as protocol:
                        parse_protocol(output_filename, protocol)

            row.update(protocol_extension=ext,
                       parsed_filename=output_filename)
            yield row
        except Exception as e:
            # there is a bug in knesset-data-python which prevents getting the error message from the exception
            # TODO: fix this bug
            error_message = "failed to parse CommitteeSessionID {}".format(row["CommitteeSessionID"])  # , str(e))
            logging.exception(error_message)
            row.update(error=error_message)
            errors.append(row)
示例#3
0
def process_row(row, row_index, resource_descriptor, resource_index,
                parameters, stats):
    """Parse the protocol document of one row and record the result on it.

    Only rows of the ``kns_documentcommitteesession`` resource are touched;
    rows of any other resource are returned unchanged. For handled rows the
    keys ``<type>_protocol_extension``, ``<type>_parsed_filename``,
    ``<type>_filesize``, ``<type>_crc32c`` and ``<type>_error`` are always
    set, where ``<type>`` is ``parameters['type']``.

    Actual parsing happens only for DOC/DOCX documents with
    ``GroupTypeID == 23``. An existing output file is reused when the hash
    stored beside it matches the hash derived from the row's
    ``download_crc32c``.

    Args:
        row: mutable mapping with the row values; updated in place.
        row_index: not used by this function.
        resource_descriptor: mapping; only its ``name`` key is read.
        resource_index: not used by this function.
        parameters: mapping with ``type`` (``"text"`` selects plain text
            output, anything else CSV parts) and optional ``files-limit``.
        stats: mapping of counters, incremented per outcome.

    Returns:
        The same ``row`` object.
    """
    if resource_descriptor['name'] != 'kns_documentcommitteesession':
        return row

    t = parameters['type']
    row[t + "_protocol_extension"] = None
    row[t + "_parsed_filename"] = None
    row[t + "_filesize"] = 0
    row[t + "_crc32c"] = None
    row[t + "_error"] = None

    is_word_protocol = (
        row['GroupTypeID'] == 23 and row['ApplicationDesc'] == 'DOC'
        and row["FilePath"].lower().endswith(('.doc', '.docx')))
    if not is_word_protocol:
        return row

    document_id = "{}-{}-{}".format(row["GroupTypeID"],
                                    row["DocumentCommitteeSessionID"],
                                    row["ApplicationDesc"])
    original_filename, ext, output_filename, full_output_filename, download_filename, full_output_hash_filename = get_filenames(
        row, parameters)

    if not (os.path.exists(download_filename)
            and row.get('download_crc32c')):
        row[t + "_error"] = 'missing download file'
        stats[t + ': missing download files'] += 1
        return row

    # The cache key combines the module-level base hash with the checksum of
    # the downloaded file.
    m = BASE_HASH_OBJ.copy()
    m.update(row['download_crc32c'].encode())
    new_cache_hash = m.hexdigest()
    old_cache_hash = _read_cached_hash(full_output_filename,
                                       full_output_hash_filename)

    if old_cache_hash and new_cache_hash == old_cache_hash:
        stats[t + ": existing files"] += 1
        _record_output(row, t, ext, output_filename, full_output_filename)
        return row

    files_limit = parameters.get('files-limit')
    if files_limit and files_limit <= stats[t + ": parsed files"]:
        row[t + "_error"] = 'reached files-limit, skipping'
        stats[t + ": skipped files"] += 1
        return row

    error_string = None
    try:
        with open(download_filename, "rb") as f:
            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                _write_protocol_output(protocol, t, full_output_filename)
    except Exception as e:
        logging.exception(
            'exception parsing protocol for {}'.format(document_id))
        try:
            # Some exceptions have an empty message; fall back to repr() so
            # that a failure is never mistaken for a success.
            error_string = str(e) or repr(e)
        except Exception:
            error_string = 'unexpected exception'

    if error_string:
        row[t + "_error"] = error_string
        stats[t + ': errored files'] += 1
    else:
        _record_output(row, t, ext, output_filename, full_output_filename)
        stats[t + ": parsed files"] += 1
        # Store the hash last so that it only ever describes a complete
        # output file.
        with open(full_output_hash_filename, 'w') as f:
            f.write(new_cache_hash)
    return row


def _read_cached_hash(full_output_filename, full_output_hash_filename):
    """Return the stored hash text, or None unless output and hash file both exist."""
    if os.path.exists(full_output_filename) and os.path.exists(
            full_output_hash_filename):
        with open(full_output_hash_filename) as f:
            return f.read()
    return None


def _record_output(row, t, ext, output_filename, full_output_filename):
    """Fill the ``<t>_*`` result keys of ``row`` from the output file on disk."""
    row[t + "_protocol_extension"] = ext
    row[t + "_parsed_filename"] = output_filename
    row[t + "_filesize"] = os.path.getsize(full_output_filename)
    row[t + "_crc32c"] = get_crc32c(full_output_filename)


def _write_protocol_output(protocol, parse_type, full_output_filename):
    """Write ``protocol`` as text or as header/body CSV to ``full_output_filename``."""
    os.makedirs(os.path.dirname(full_output_filename), exist_ok=True)
    # Write to a temporary file first and copy it over only once the whole
    # protocol was written.
    with utils.temp_file() as temp_filename:
        if parse_type == "text":
            with open(temp_filename, "w") as of:
                of.write(protocol.text)
        else:
            # The csv module requires newline="" so that row terminators and
            # newlines embedded in fields are written untranslated.
            with open(temp_filename, "w", newline="") as of:
                csv_writer = csv.writer(of)
                csv_writer.writerow(["header", "body"])
                csv_writer.writerows(
                    [part.header, part.body] for part in protocol.parts)
        shutil.copy(temp_filename, full_output_filename)