def parse_protocol(row):
    # parse one downloaded committee session document; relies on the module-level
    # parse_type and stats, and on the get_filenames / get_crc32c helpers
    original_filename, ext, output_filename, full_output_filename, download_filename = get_filenames(row)
    if os.path.exists(full_output_filename):
        # output already exists - reuse it
        logging.info('existing file: {}'.format(full_output_filename))
        stats["existing files"] += 1
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
    elif os.path.exists(download_filename):
        with open(download_filename, "rb") as f:
            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                os.makedirs(os.path.dirname(full_output_filename), exist_ok=True)
                # write to a temp file first and copy at the end, so a failed parse
                # doesn't leave a partial output file behind
                with utils.temp_file() as temp_filename:
                    with open(temp_filename, "w") as of:
                        if parse_type == "text":
                            of.write(protocol.text)
                        else:
                            csv_writer = csv.writer(of)
                            csv_writer.writerow(["header", "body"])
                            for part in protocol.parts:
                                csv_writer.writerow([part.header, part.body])
                    shutil.copy(temp_filename, full_output_filename)
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
        logging.info('parsed file: {}'.format(full_output_filename))
        stats["parsed files"] += 1
    else:
        logging.warning('missing document committee session file: {}'.format(download_filename))
        ext, output_filename, filesize, crc32c = None, None, 0, None
    return ext, output_filename, filesize, crc32c
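# get_crc32c is a module-level helper that is not shown here; judging by its use alongside
# os.path.getsize, it returns a CRC32C checksum of a file on disk. A minimal sketch, assuming
# the crcmod package is available and that the digest is base64-encoded (the way Google Cloud
# Storage reports crc32c); the actual helper may differ:

import base64
import crcmod.predefined

def get_crc32c(filename):
    # stream the file in chunks so large protocol files don't need to fit in memory
    crc = crcmod.predefined.Crc('crc-32c')
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            crc.update(chunk)
    return base64.b64encode(crc.digest()).decode()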
def get_resource():
    for row_num, row in enumerate(download_rows):
        logging.info("{} / {}".format(row_num, len(download_rows)))
        try:
            original_filename = os.path.join("files", str(row["GroupTypeID"]),
                                             str(row["DocumentCommitteeSessionID"])[0],
                                             str(row["DocumentCommitteeSessionID"])[1],
                                             str(row["DocumentCommitteeSessionID"]) + "." + row["ApplicationDesc"])
            ext = os.path.splitext(original_filename)[1].lower()
            output_filename = "files/{}/{}/{}.{}".format(str(row["CommitteeSessionID"])[0],
                                                         str(row["CommitteeSessionID"])[1],
                                                         str(row["CommitteeSessionID"]),
                                                         "csv" if parse_type == "parts" else "txt")
            if not files_limit or stats["parsed files"] < files_limit:
                if download_from_path:
                    download_filename = "../data/committees/download_document_committee_session/" + original_filename
                    if os.path.exists(download_filename):
                        with open(download_filename, "rb") as f:
                            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                                parse_protocol(output_filename, protocol)
                    else:
                        logging.warning("missing download_filename {}".format(download_filename))
                elif download_from_remote_storage:
                    url = download_from_remote_storage + original_filename
                    with CommitteeMeetingProtocol.get_from_url(url) as protocol:
                        parse_protocol(output_filename, protocol)
                else:
                    raise Exception("no valid download option")
            row.update(protocol_extension=ext, parsed_filename=output_filename)
            yield row
        except Exception as e:
            # there is a bug in knesset-data-python which prevents getting the error message from the exception
            # TODO: fix this bug
            error_message = "failed to parse CommitteeSessionID {}".format(row["CommitteeSessionID"])  # , str(e))
            logging.exception(error_message)
            row.update(error=error_message)
            errors.append(row)
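# get_filenames is a helper that is not shown here. Below is a minimal sketch of the
# five-value variant used by parse_protocol above, reconstructed from the path logic in
# get_resource; OUT_PATH and DOWNLOAD_PATH are hypothetical module-level settings, not names
# taken from the original code. The process_row variant further down takes parameters and
# also returns a cache-hash filename.

import os

OUT_PATH = "../data/committees/meeting_protocols"  # assumed output root
DOWNLOAD_PATH = "../data/committees/download_document_committee_session"  # matches get_resource above

def get_filenames(row):
    document_id = str(row["DocumentCommitteeSessionID"])
    session_id = str(row["CommitteeSessionID"])
    original_filename = os.path.join("files", str(row["GroupTypeID"]), document_id[0], document_id[1],
                                     document_id + "." + row["ApplicationDesc"])
    ext = os.path.splitext(original_filename)[1].lower()
    output_filename = "files/{}/{}/{}.{}".format(session_id[0], session_id[1], session_id,
                                                 "csv" if parse_type == "parts" else "txt")
    full_output_filename = os.path.join(OUT_PATH, output_filename)
    download_filename = os.path.join(DOWNLOAD_PATH, original_filename)
    return original_filename, ext, output_filename, full_output_filename, download_filename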
def process_row(row, row_index, resource_descriptor, resource_index, parameters, stats):
    if resource_descriptor['name'] == 'kns_documentcommitteesession':
        t = parameters['type']
        row[t + "_protocol_extension"] = None
        row[t + "_parsed_filename"] = None
        row[t + "_filesize"] = 0
        row[t + "_crc32c"] = None
        row[t + "_error"] = None
        # only committee protocol documents (GroupTypeID 23) in Word format are parsed
        if (row['GroupTypeID'] == 23 and row['ApplicationDesc'] == 'DOC'
                and (row["FilePath"].lower().endswith('.doc') or row["FilePath"].lower().endswith('.docx'))):
            document_id = "{}-{}-{}".format(row["GroupTypeID"], row["DocumentCommitteeSessionID"],
                                            row["ApplicationDesc"])
            (original_filename, ext, output_filename, full_output_filename,
             download_filename, full_output_hash_filename) = get_filenames(row, parameters)
            if os.path.exists(download_filename) and row.get('download_crc32c'):
                # cache key: hash of the source file's crc32c, seeded by the module-level
                # BASE_HASH_OBJ (a hashlib-style object, see the sketch after this function)
                m = BASE_HASH_OBJ.copy()
                m.update(row['download_crc32c'].encode())
                new_cache_hash = m.hexdigest()
                if os.path.exists(full_output_filename) and os.path.exists(full_output_hash_filename):
                    with open(full_output_hash_filename) as f:
                        old_cache_hash = f.read()
                else:
                    old_cache_hash = None
                if old_cache_hash and new_cache_hash and new_cache_hash == old_cache_hash:
                    # output is up to date - reuse the previously parsed file
                    stats[t + ": existing files"] += 1
                    row[t + "_protocol_extension"] = ext
                    row[t + "_parsed_filename"] = output_filename
                    row[t + "_filesize"] = os.path.getsize(full_output_filename)
                    row[t + "_crc32c"] = get_crc32c(full_output_filename)
                elif parameters.get('files-limit') and parameters['files-limit'] <= stats[t + ": parsed files"]:
                    row[t + "_error"] = 'reached files-limit, skipping'
                    stats[t + ": skipped files"] += 1
                else:
                    error_string = None
                    try:
                        with open(download_filename, "rb") as f:
                            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                                os.makedirs(os.path.dirname(full_output_filename), exist_ok=True)
                                with utils.temp_file() as temp_filename:
                                    with open(temp_filename, "w") as of:
                                        if parameters['type'] == "text":
                                            of.write(protocol.text)
                                        else:
                                            csv_writer = csv.writer(of)
                                            csv_writer.writerow(["header", "body"])
                                            for part in protocol.parts:
                                                csv_writer.writerow([part.header, part.body])
                                    shutil.copy(temp_filename, full_output_filename)
                    except Exception as e:
                        logging.exception('exception parsing protocol for {}'.format(document_id))
                        try:
                            error_string = str(e)
                        except Exception:
                            error_string = 'unexpected exception'
                    if error_string:
                        row[t + "_error"] = error_string
                        stats[t + ': errored files'] += 1
                    else:
                        row[t + "_protocol_extension"] = ext
                        row[t + "_parsed_filename"] = output_filename
                        row[t + "_filesize"] = os.path.getsize(full_output_filename)
                        row[t + "_crc32c"] = get_crc32c(full_output_filename)
                        stats[t + ": parsed files"] += 1
                        # record the cache hash only after a successful parse
                        with open(full_output_hash_filename, 'w') as f:
                            f.write(new_cache_hash)
            else:
                row[t + "_error"] = 'missing download file'
                stats[t + ': missing download files'] += 1
    return row
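# BASE_HASH_OBJ is assumed to be a module-level hashlib-style object that encodes the parser
# "version": process_row copies it, feeds in the source file's download_crc32c, and uses the
# hexdigest as a cache key, so changing the seed string invalidates every previously parsed file.
# A minimal sketch, assuming hashlib.md5 and an illustrative seed (the actual seed in the
# original code may differ):

import hashlib

BASE_HASH_OBJ = hashlib.md5()
BASE_HASH_OBJ.update(b"parse-protocols-cache-v1")  # hypothetical cache-busting seed

# usage, as in process_row above:
#     m = BASE_HASH_OBJ.copy()
#     m.update(row['download_crc32c'].encode())
#     new_cache_hash = m.hexdigest()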