def _save_url(self, url, bucket, object_name, num_retries, seconds_between_retries, retry_num=1):
    try:
        res = self._reuqests_get(url)
    except RequestException as e:
        if retry_num < num_retries:
            logging.exception(e)
            logging.info("retry {} / {}, waiting {} seconds before retrying...".format(
                retry_num, num_retries, seconds_between_retries))
            time.sleep(seconds_between_retries)
            return self._save_url(url, bucket, object_name, num_retries, seconds_between_retries, retry_num + 1)
        else:
            raise
    if res.status_code == 200:
        object_storage.write(self.s3, bucket, object_name, res.content)
        return True
    else:
        return False
def _save_schema_html(self, save_schema):
    object_name = save_schema.format(table_name=self._tablename, ext="html")
    bucket = self._parameters["schemas-bucket"]
    object_storage.write(self.s3, bucket, object_name, self._get_schema_html(), public_bucket=True)
def _save_schema_json(self, save_schema):
    object_name = save_schema.format(table_name=self._tablename, ext="json")
    bucket = self._parameters["schemas-bucket"]
    object_storage.write(self.s3, bucket, object_name, json.dumps(self._schema, indent=2), public_bucket=True)
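# The save_schema argument to the two helpers above is a format template with
# {table_name} and {ext} placeholders. The template value below is illustrative
# only, not the module's actual DEFAULT_SAVE_SCHEMA:
#
#     save_schema = "table-schemas/{table_name}.{ext}"
#     save_schema.format(table_name="committees", ext="json")
#     # -> "table-schemas/committees.json"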
def _parse_doc_protocol(self, committee_id, meeting_id, bucket, protocol_object_name, parts_object_name, text_object_name):
    logging.info("parsing doc protocol {} --> {}, {}".format(protocol_object_name, parts_object_name, text_object_name))
    with object_storage.temp_download(self.s3, bucket, protocol_object_name) as protocol_filename:
        try:
            with CommitteeMeetingProtocol.get_from_filename(protocol_filename) as protocol:
                object_storage.write(self.s3, bucket, text_object_name, protocol.text, public_bucket=True)
                self._parse_protocol_parts(bucket, parts_object_name, protocol)
        except (
            AntiwordException,  # see https://github.com/hasadna/knesset-data-pipelines/issues/15
            subprocess.SubprocessError,
            xml.etree.ElementTree.ParseError  # see https://github.com/hasadna/knesset-data-pipelines/issues/32
        ):
            logging.exception("committee {} meeting {}: failed to parse doc file, skipping".format(committee_id, meeting_id))
            return False
    return True
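# For reference, a minimal sketch of what a temp_download-style context manager
# can look like, assuming a boto3-style client that exposes
# download_file(bucket, key, filename). This illustrates the pattern only and
# is not the actual object_storage implementation used above:
#
# import contextlib
# import os
# import tempfile
#
#
# @contextlib.contextmanager
# def temp_download_sketch(s3, bucket, object_name):
#     fd, filename = tempfile.mkstemp()
#     os.close(fd)  # mkstemp returns an open fd; close it so download_file can write
#     try:
#         s3.download_file(bucket, object_name, filename)
#         yield filename
#     finally:
#         os.remove(filename)  # the file only exists for the duration of the with block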
def _save_url(self, url, bucket, object_name, num_retries, seconds_between_retries, retry_num=1):
    try:
        res = self._reuqests_get(url)
    except RequestException as e:
        if retry_num < num_retries:
            logging.exception(e)
            logging.info("retry {} / {}, waiting {} seconds before retrying...".format(
                retry_num, num_retries, seconds_between_retries))
            time.sleep(seconds_between_retries)
            return self._save_url(url, bucket, object_name, num_retries, seconds_between_retries, retry_num + 1)
        else:
            raise
    if res.status_code == 200:
        object_storage.write(self.s3, bucket, object_name, res.content, public_bucket=True)
        return True
    else:
        return False
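# The two _save_url variants above retry recursively, so every retry adds a
# stack frame. An equivalent iterative sketch of the same fixed-interval retry
# pattern, assuming only the standard library and requests (the function name
# is illustrative and not part of this module):
#
# import logging
# import time
#
# import requests
# from requests.exceptions import RequestException
#
#
# def fetch_with_retries(url, num_retries, seconds_between_retries):
#     # make up to num_retries attempts, sleeping between them; re-raise on the last
#     for retry_num in range(1, num_retries + 1):
#         try:
#             return requests.get(url)
#         except RequestException as e:
#             if retry_num >= num_retries:
#                 raise
#             logging.exception(e)
#             logging.info("retry {} / {}, waiting {} seconds before retrying...".format(
#                 retry_num, num_retries, seconds_between_retries))
#             time.sleep(seconds_between_retries)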
def filter_resources(datapackage, resources, parameters, stats):
    tables = []
    for resource_descriptor, resource_data in zip(datapackage["resources"], resources):
        schema = resource_descriptor["schema"]
        stats[resource_descriptor["name"]] = 0
        tables.append(_get_schema_table(resource_descriptor["name"], schema["fields"], schema["primaryKey"]))
        yield filter_resource(resource_descriptor, resource_data, stats)
    html = """<html><head><meta charset="UTF-8"></head><body>{tables}</body></html>""".format(tables="".join(tables))
    save_schema = parameters.get("save-schema", DEFAULT_SAVE_SCHEMA)
    if save_schema:
        save_schema_html = save_schema.format(table_name=datapackage["name"], ext="html")
        save_schema_json = save_schema.format(table_name=datapackage["name"], ext="json")
        s3 = object_storage.get_s3()
        object_storage.write(s3, parameters["bucket"], save_schema_html, html, public_bucket=True)
        object_storage.write(s3, parameters["bucket"], save_schema_json, json.dumps(datapackage["resources"], indent=2, ensure_ascii=False), public_bucket=True)
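# filter_resources is a generator over per-resource row iterators, so the
# schema HTML/JSON is only written once every resource has been fully
# consumed. A sketch of how such a processor is typically wired up, assuming
# it runs under the datapackage-pipelines ingest/spew wrapper (the stats dict
# is filled in as the resources stream through):
#
# from datapackage_pipelines.wrapper import ingest, spew
#
# parameters, datapackage, resources = ingest()
# stats = {}
# spew(datapackage, filter_resources(datapackage, resources, parameters, stats), stats)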