Example #1
async def main(
    ol_config: str,
    debugger=False,
    state_file='solr-update.state',
    exclude_edits_containing: str | None = None,
    ol_url='http://openlibrary.org/',
    solr_url: str | None = None,
    solr_next=False,
    socket_timeout=10,
    load_ia_scans=False,
    commit=True,
    initial_state: str | None = None,
):
    """
    :param debugger: Wait for a debugger to attach before beginning
    :param exclude_edits_containing: Don't index matching edits
    :param solr_url: Override the Solr base URL from the config file
    :param solr_next: Whether to assume the new (next) Solr schema is in use
    :param initial_state: State to use if state file doesn't exist. Defaults to today.
    """
    FORMAT = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    logger.info("BEGIN new-solr-updater")

    if debugger:
        import debugpy

        logger.info("Enabling debugger attachment (attach if it hangs here)")
        debugpy.listen(address=('0.0.0.0', 3000))
        logger.info("Waiting for debugger to attach...")
        debugpy.wait_for_client()
        logger.info("Debugger attached to port 3000")

    # Requests to archive.org sometimes block forever.
    # Setting a timeout makes the request fail instead of hanging indefinitely.
    socket.setdefaulttimeout(socket_timeout)

    # Set the OL query host, e.g. when running against a dev instance
    if ol_url:
        host = web.lstrips(ol_url, "http://").strip("/")
        update_work.set_query_host(host)

    if solr_url:
        update_work.set_solr_base_url(solr_url)

    update_work.set_solr_next(solr_next)

    logger.info("loading config from %s", ol_config)
    load_config(ol_config)

    offset = read_state_file(state_file, initial_state)

    logfile = InfobaseLog(config.get('infobase_server'),
                          exclude=exclude_edits_containing)
    logfile.seek(offset)

    solr = Solr()

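    # Main loop: tail the infobase edit log, reindex the affected keys in Solr,
    # persist the current log offset, and sleep briefly once caught up.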
    while True:
        records = logfile.read_records()
        keys = parse_log(records, load_ia_scans)
        count = await update_keys(keys)

        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if commit:
            solr.commit(ndocs=count)
        else:
            logger.info("not doing solr commit as commit is off")

        # Don't sleep if any records were processed: while the commit was
        # running, more edits may have arrived.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            time.sleep(5)
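A minimal sketch of how this coroutine could be driven directly, assuming the module is imported on its own rather than through whatever CLI wrapper the project provides; the config path and argument values below are placeholders, not taken from the source:

import asyncio

# Hypothetical driver: all paths and URLs are placeholders.
if __name__ == '__main__':
    asyncio.run(
        main(
            ol_config='conf/openlibrary.yml',   # placeholder config path
            ol_url='http://openlibrary.org/',
            state_file='solr-update.state',
            commit=True,
        )
    )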
Example #2
async def main(
    cmd: Literal['index', 'fetch-end'],
    job: Literal['works', 'orphans', 'authors'],
    postgres="postgres.ini",
    ol="http://ol/",
    ol_config="../../conf/openlibrary.yml",
    solr: str | None = None,
    skip_solr_id_check=True,
    start_at: str | None = None,
    offset=0,
    limit=1,
    last_modified: str | None = None,
    progress: str | None = None,
    log_file: str | None = None,
    log_level=logging.INFO,
    dry_run=False,
) -> None:
    """
    :param cmd: Whether to run the indexing or just fetch the end of the chunk
    :param job: Type of record to index; 'orphans' indexes orphaned editions
    :param postgres: Path to postgres config file
    :param ol: Open Library endpoint
    :param ol_config: Path to Open Library config file
    :param solr: Override the Solr base URL from ol_config
    :param start_at: key (type-prefixed) to start from as opposed to offset; WAY more
    efficient since offset has to walk through all `offset` rows.
    :param offset: Use `start_at` if possible.
    :param last_modified: Limit results to those modified >= this date
    :param progress: File to write the progress indicator to, if any
    :param log_file: Redirect logs to file instead of stdout
    """

    logging.basicConfig(
        filename=log_file,
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    if solr:
        update_work.set_solr_base_url(solr)

    PLogEntry = namedtuple(
        'PLogEntry',
        [
            'seen',
            'total',
            'percent',
            'elapsed',
            'q_1',
            'q_auth',
            'q_ia',
            'cached',
            'ia_cache',
            'next',
        ],
    )
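    # Each PLogEntry is written as one tab-separated row of the progress file:
    # counts (seen, total, cached, ia_cache), cumulative timings in seconds
    # (elapsed, q_1, q_auth, q_ia), the completion percentage, and the next key.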

    class PLog:
        def __init__(self, filename):
            """
            :param str or None filename:
            """
            self.filename = filename
            self.last_entry = None

        def log(self, entry):
            """
            :param PLogEntry entry:
            """
            self.last_entry = entry
            if self.filename:
                with open(self.filename, 'a') as f:
                    f.write('\t'.join(
                        self.fmt(k, val)
                        for k, val in entry._asdict().items()))
                    f.write('\n')

        def update(
            self,
            seen=None,
            total=None,
            percent=None,
            elapsed=None,
            q_1=None,
            q_auth=None,
            cached=None,
            q_ia=None,
            ia_cache=None,
            next=None,
        ):
            """
            :param str or int or None seen:
            :param str or int or None total:
            :param str or float or None percent:
            :param str or float or None elapsed:
            :param str or float or None q_1:
            :param str or float or None q_auth:
            :param str or int or None cached:
            :param str or float or None q_ia:
            :param str or int or None ia_cache:
            :param str or None next:
            :return: None
            """
            args = locals()
            entry = self.last_entry._replace(
                **{f: args[f] for f in PLogEntry._fields if args[f] is not None}
            )
            self.log(entry)

        def fmt(self, k, val):
            """
            :param str k:
            :param Any val:
            :rtype: str
            """
            if val is None:
                return '?'
            if isinstance(val, str):
                return val
            if k == 'percent':
                return '%.2f%%' % (100 * val)
            if k in ['elapsed', 'q_1', 'q_auth', 'q_ia']:
                return '%.2fs' % val
            if isinstance(val, float):
                return '%.2f' % val
            if k == 'next':
                return val.split('/')[-1]
            return str(val)

    plog = PLog(progress)

    # Connect to the database described by the postgres config file
    with LocalPostgresDataProvider(postgres) as db:
        # Check to see where we should be starting from
        if cmd == 'fetch-end':
            next_start_query = build_job_query(job, start_at, limit,
                                               last_modified, 1)
            next_start_results = db.query_all(next_start_query)
            if next_start_results:
                print(next_start_results[0][0])
            return

        logger.info(
            json.dumps({
                'scope': 'solr_builder::main',
                'event': 'Indexing started',
                'start_at': start_at,
            }))
        load_configs(ol, ol_config, db)
        q = build_job_query(job, start_at, offset, last_modified, limit)

        if progress:
            # Clear the file
            with open(progress, 'w') as f:
                f.write('')
            with open(progress, 'a') as f:
                f.write('Calculating total... ')

        start = time.time()
        q_count = """SELECT COUNT(*) FROM(%s) AS foo""" % q
        count = db.query_all(q_count)[0][0]
        end = time.time()

        if progress:
            with open(progress, 'a') as f:
                f.write('%d (%.2fs)\n' % (count, end - start))
                f.write('\t'.join(PLogEntry._fields) + '\n')

        plog.log(
            PLogEntry(0, count, '0.00%', 0, '?', '?', '?', '?', '?', start_at
                      or '?'))
        plog.update(q_1=0, q_auth=0, q_ia=0)

        start = time.time()
        seen = 0
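        # Process the query results in batches, caching related records
        # (editions, IA metadata, authors) in a scratch data provider
        # before handing each batch of keys to the indexer.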
        for batch in db.query_batched(q, size=1000, cache_json=True):
            keys = [x[0] for x in batch]
            plog.update(next=keys[0], cached=len(db.cache), ia_cache=0)

            with LocalPostgresDataProvider(postgres) as db2:
                key_range = [keys[0], keys[-1]]

                if job == "works":
                    # cache editions
                    editions_time, _ = simple_timeit(
                        lambda: db2.cache_work_editions(*key_range))
                    plog.update(
                        q_1=plog.last_entry.q_1 + editions_time,
                        cached=len(db.cache) + len(db2.cache),
                    )

                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata())
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_work_authors(*key_range))
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "orphans":
                    # cache editions' ocaid metadata
                    ocaids_time, _ = await simple_timeit_async(
                        db2.cache_cached_editions_ia_metadata())
                    plog.update(
                        q_ia=plog.last_entry.q_ia + ocaids_time,
                        ia_cache=len(db2.ia_cache),
                    )

                    # cache authors
                    authors_time, _ = simple_timeit(
                        lambda: db2.cache_edition_authors(*key_range))
                    plog.update(
                        q_auth=plog.last_entry.q_auth + authors_time,
                        cached=len(db.cache) + len(db2.cache),
                    )
                elif job == "authors":
                    # Nothing to cache; update_work.py queries solr directly
                    # for each author, and provides no way to cache.
                    pass

                # Store in main cache
                db.cache.update(db2.cache)
                db.ia_cache.update(db2.ia_cache)
                db.cached_work_editions_ranges += db2.cached_work_editions_ranges

            await update_keys(
                keys,
                commit=False,
                commit_way_later=True,
                skip_id_check=skip_solr_id_check,
                update='quiet' if dry_run else 'update',
            )

            seen += len(keys)
            plog.update(
                elapsed=time.time() - start,
                seen=seen,
                percent=seen / count,
                cached=len(db.cache),
                ia_cache=len(db.ia_cache),
            )

            db.clear_cache()
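A minimal sketch of how this entry point could be invoked, e.g. to index works starting from a given key; asyncio.run is standard library and every argument value here is a placeholder, not taken from the source:

import asyncio

# Hypothetical driver: the starting key, limit, and progress file are placeholders.
if __name__ == '__main__':
    asyncio.run(
        main(
            cmd='index',
            job='works',
            postgres='postgres.ini',
            ol_config='../../conf/openlibrary.yml',
            start_at='/works/OL1W',    # placeholder starting key
            limit=1000,                # placeholder row limit
            progress='progress.txt',   # placeholder progress file
            dry_run=True,
        )
    )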