Example #1
import asyncio

from sqlalchemy import MetaData
from sqlalchemy.exc import OperationalError
from sqlalchemy.ext.asyncio import AsyncEngine
from structlog.stdlib import BoundLogger

# DatabaseInitializationError is assumed to be defined alongside this
# function in the surrounding module.


async def initialize_database(
    engine: AsyncEngine,
    logger: BoundLogger,
    *,
    schema: MetaData,
    reset: bool = False,
) -> None:
    """Create and initialize a new database.

    Parameters
    ----------
    engine : `sqlalchemy.ext.asyncio.AsyncEngine`
        Database engine to use.  Create with `create_database_engine`.
    logger : `structlog.stdlib.BoundLogger`
        Logger used to report problems.
    schema : `sqlalchemy.sql.schema.MetaData`
        Metadata for the database schema.  Generally this will be
        ``Base.metadata`` where ``Base`` is the declarative base used as the
        base class for all ORM table definitions.  The caller must ensure that
        all table definitions have been imported by Python before calling this
        function, or parts of the schema will be missing.
    reset : `bool`, optional
        If set to `True`, drop all tables and reprovision the database.
        Useful when running tests with an external database.  Default is
        `False`.

    Raises
    ------
    DatabaseInitializationError
        Raised if the database could not be initialized after five attempts.
        This is normally due to a connectivity issue with the database.
    """
    success = False
    error = None
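    # The database may still be starting up when this runs, so retry the
    # schema creation a few times before giving up.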
    for _ in range(5):
        try:
            async with engine.begin() as conn:
                if reset:
                    await conn.run_sync(schema.drop_all)
                await conn.run_sync(schema.create_all)
            success = True
        except (ConnectionRefusedError, OperationalError, OSError) as e:
            logger.info("database not ready, waiting two seconds")
            error = str(e)
            await asyncio.sleep(2)
            continue
        if success:
            logger.info("initialized database schema")
            break
    if not success:
        msg = "database schema initialization failed (database not reachable?)"
        logger.error(msg)
        await engine.dispose()
        raise DatabaseInitializationError(error)
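A minimal usage sketch. The connection URL and declarative `Base` are illustrative stand-ins, and since the `create_database_engine` helper mentioned in the docstring is not shown here, plain SQLAlchemy `create_async_engine` is used in its place.

import asyncio

import structlog
from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.orm import declarative_base

Base = declarative_base()  # base class for the application's ORM models


async def main() -> None:
    # Hypothetical DSN; substitute your database's connection URL.
    engine = create_async_engine("postgresql+asyncpg://user:pass@localhost/app")
    logger = structlog.get_logger("app")
    await initialize_database(engine, logger, schema=Base.metadata, reset=False)
    await engine.dispose()


asyncio.run(main())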
Example #2
from structlog.stdlib import BoundLogger


def _clear_common_values(log: BoundLogger, existing_df, data_source,
                         index_fields, column_to_fill):
    """Clear column_to_fill in existing_df for index labels shared with data_source.

    existing_df is modified in place. Index labels (the values in the index
    for one row) do not need to be unique in a table.
    """
    existing_df.set_index(index_fields, inplace=True)
    data_source.set_index(index_fields, inplace=True)
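    # Index labels present in both frames identify the rows whose
    # column_to_fill values must be cleared.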
    common_labels_without_date = existing_df.index.intersection(
        data_source.index)
    if not common_labels_without_date.empty:
        # Maybe only do this for rows with some value in column_to_fill.
        existing_df.sort_index(inplace=True, sort_remaining=True)
        existing_df.loc[common_labels_without_date, [column_to_fill]] = None
        log.error(
            "Duplicate timeseries data",
            common_labels=common_labels_without_date.to_frame(
                index=False).to_dict(orient="records"),
        )
    existing_df.reset_index(inplace=True)
    data_source.reset_index(inplace=True)
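A toy usage sketch; the `fips`/`date`/`cases` columns and values are illustrative assumptions, not part of the original function.

import pandas as pd
import structlog

log = structlog.get_logger()
existing = pd.DataFrame(
    {
        "fips": ["06", "06", "36"],
        "date": ["2020-04-01", "2020-04-02", "2020-04-01"],
        "cases": [10, 12, 5],
    }
)
new_data = pd.DataFrame({"fips": ["06"], "date": ["2020-04-01"], "cases": [11]})

# Both "06" rows in `existing` get their "cases" cleared, because the label
# "06" also appears in `new_data`; the "36" row is left untouched.
_clear_common_values(log, existing, new_data, ["fips"], "cases")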
Example #3
import asyncio
import json
from typing import Any, Dict

from aiohttp import web
from algoliasearch.responses import MultipleResponse
from structlog.stdlib import BoundLogger

# The project-local helpers used below (get_json_data, create_record,
# generate_surrogate_key, delete_old_records, ReducedLtdLanderDocument,
# ContentChunk) are assumed to be importable from the surrounding package.


async def ingest_ltd_lander_jsonld_document(
    *,
    app: web.Application,
    logger: BoundLogger,
    url_ingest_message: Dict[str, Any],
) -> None:
    """Run the Algolia ingest of a LTD_LANDER_JSONLD content type.

    Parameters
    ----------
    app : `aiohttp.web.Application`
        The aiohttp application, which provides the shared HTTP session and
        the Algolia client.
    logger : `structlog.stdlib.BoundLogger`
        A structlog logger that is bound with context about the Kafka message.
    url_ingest_message : `dict`
        The deserialized value of the Kafka message.
    """
    logger = logger.bind(
        content_url=url_ingest_message["url"],
        content_type=url_ingest_message["content_type"],
    )
    logger.info("Starting LTD_LANDER_JSONLD ingest")

    http_session = app["safir/http_session"]

    edition_data = await get_json_data(
        url=url_ingest_message["edition"]["url"],
        logger=logger,
        http_session=http_session,
    )

    published_url = edition_data["published_url"]
    jsonld_name = "metadata.jsonld"
    if published_url.endswith("/"):
        jsonld_url = f"{published_url}{jsonld_name}"
    else:
        jsonld_url = f"{published_url}/{jsonld_name}"

    try:
        metadata = await get_json_data(
            url=jsonld_url,
            logger=logger,
            http_session=http_session,
            # Bypass aiohttp's content-type check and force the encoding;
            # the jsonld files are not served with a correct Content-Type
            # header.
            encoding="utf-8",
            content_type=None,
        )
    except Exception:
        logger.exception("Failure getting metadata.jsonld",
                         jsonld_url=jsonld_url)
        raise

    try:
        reduced_document = ReducedLtdLanderDocument(url=published_url,
                                                    metadata=metadata,
                                                    logger=logger)
    except Exception:
        logger.exception("Failed to build record")
        raise

    # Every record in this ingest shares one surrogate key, so records left
    # over from earlier ingests of the same document can be identified and
    # deleted afterwards (see delete_old_records below).
    surrogate_key = generate_surrogate_key()

    logger.debug("Reduced LTD Lander Document",
                 chunks=len(reduced_document.chunks))

    try:
        records = [
            create_record(
                chunk=s,
                document=reduced_document,
                surrogate_key=surrogate_key,
            ) for s in reduced_document.chunks
        ]

        description_chunk = ContentChunk(
            headers=[reduced_document.h1],
            content=reduced_document.description,
        )
        records.append(
            create_record(
                chunk=description_chunk,
                document=reduced_document,
                surrogate_key=surrogate_key,
            ))
    except Exception:
        logger.exception("Failed to build records")
        raise

    logger.info("Finished building records")

    if app["ook/algolia_search"] is not None:
        try:
            client = app["ook/algolia_search"]
            index = client.init_index(
                app["safir/config"].algolia_document_index_name)
        except Exception:
            logger.exception(
                "Error initializing Algolia index",
                index_name=app["safir/config"].algolia_document_index_name,
            )
            raise

        # Upload all records concurrently and wait for Algolia to index them.
        tasks = [index.save_object_async(record) for record in records]
        try:
            results = await asyncio.gather(*tasks)
            MultipleResponse(results).wait()
        except Exception:
            logger.exception("Got algoliasearch request error")
            for record in records:
                logger.debug(json.dumps(record, indent=2, sort_keys=True))
            raise

        logger.info("Finished uploading to Algolia")

        await delete_old_records(
            index=index,
            base_url=records[0]["baseUrl"],
            surrogate_key=surrogate_key,
            logger=logger,
        )
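For orientation, a sketch of the minimal Kafka message shape this handler expects, inferred only from the keys accessed above; the URLs are hypothetical.

url_ingest_message = {
    "content_type": "LTD_LANDER_JSONLD",
    "url": "https://example.lsst.io/",  # hypothetical content URL
    "edition": {
        # Hypothetical LTD Keeper edition resource URL; the JSON payload
        # fetched from it must contain a "published_url" field.
        "url": "https://keeper.example.org/editions/123",
    },
}

# Called from a Kafka consumer after deserializing the message:
# await ingest_ltd_lander_jsonld_document(
#     app=app, logger=logger, url_ingest_message=url_ingest_message
# )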