def get_ia_db(configfile=None):
    """Metadata API is slow.

    Talk to the archive.org database directly if it is specified in the
    global configuration or if a configfile is provided.
    """
    if configfile:
        from openlibrary.config import load_config

        load_config(configfile)

    if not config.get("ia_db"):
        return None

    global _ia_db
    if not _ia_db:
        settings = config.ia_db
        host = settings['host']
        db = settings['db']
        user = settings['user']
        # pw_file is executed as a shell command; its stdout is the password.
        pw = os.popen(settings['pw_file']).read().strip()
        _ia_db = web.database(dbn="postgres", host=host, db=db, user=user, pw=pw)
    return _ia_db
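For reference, a sketch of the `ia_db` section this function expects in openlibrary.yml; the values below are illustrative, not production settings. Because the value of `pw_file` is passed to `os.popen`, it is a command that prints the password, not a plain path.

ia_db:
    host: db.example.archive.org          # illustrative hostname
    db: archive
    user: ol_reader
    pw_file: cat /olsystem/etc/ia-db-pw   # assumed: command whose stdout is the password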
def main(ol_config: str):
    load_config(ol_config)

    # Partner data is offset ~15 days from start of month
    date = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', date.year, date.month)
    batch = Batch.find(batch_name) or Batch.new(batch_name)
    batch_import(sys.argv[1], batch)
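Concretely, the batch name is the 'bwb' prefix plus the year and zero-padded month of the offset date; a quick illustration with a fixed date:

import datetime
from datetime import timedelta

date = datetime.date(2024, 1, 10) - timedelta(days=15)  # 2023-12-26
print("%s-%04d%02d" % ('bwb', date.year, date.month))   # prints: bwb-202312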
def main():
    load_config(
        os.path.abspath(os.path.join(os.sep, 'olsystem', 'etc', 'openlibrary.yml'))
    )

    # Partner data is offset ~15 days from start of month
    date = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', date.year, date.month)
    batch = Batch.find(batch_name) or Batch.new(batch_name)
    batch_import(sys.argv[1], batch)
def connect_to_couch(config_file):
    "Connects to the couch databases"
    load_config(config_file)
    infogami._setup()

    # safe_load avoids executing arbitrary YAML tags; a bare yaml.load also
    # requires an explicit Loader argument in PyYAML >= 5.1.
    with open(config_file) as f:
        config = yaml.safe_load(f)

    admin_db = config["admin"]["counts_db"]
    return couchdb.Database(admin_db)
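The corresponding YAML section would look something like this, with an illustrative CouchDB URL:

admin:
    counts_db: http://couchdb.example.org:5984/admin_counts   # illustrative URL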
def main():
    '''Command-line interface for searching the OL database and updating
    Solr's search index.'''
    global options
    options = parse_options()

    if not config.runtime_config:
        config.load(options.config)
        config.load_config(options.config)

    if options.daemon:
        start_daemon()
    else:
        scan_days()
def import_job(
    ol_config: str,
    dry_run=False,
) -> None:
    """
    :param ol_config: Path to openlibrary.yml file
    :param dry_run: If true, only print out records to import
    """
    load_config(ol_config)

    # Make HEAD request to get last-modified time
    last_modified = find_last_updated()

    if not last_modified:
        print(f'HEAD request to {FEED_URL} failed. Not attempting GET request.')
        return

    print(f'Last-Modified date: {last_modified}')

    updated_on = get_last_updated_time()
    if last_modified == updated_on:
        print(f'No new updates since {updated_on}. Processing completed.')
        return

    print(f'Last import job: {updated_on or "No date found"}')

    # Get feed:
    d = get_feed()

    # Create datetime using updated_on:
    modified_since = convert_date_string(updated_on)

    # Map feed entries to list of import objects:
    print(f'Importing all entries that have been updated since {modified_since}.')
    modified_entries = filter_modified_since(d.entries, modified_since)
    print(f'{len(modified_entries)} import objects created.')

    if not dry_run:
        create_batch(modified_entries)
        print(f'{len(modified_entries)} entries added to the batch import job.')
    else:
        for record in modified_entries:
            print(json.dumps(record))

    # Store timestamp for header
    if not dry_run:
        with open(LAST_UPDATED_TIME, 'w+') as f:
            f.write(last_modified)
        print(f'Last updated timestamp written to: {LAST_UPDATED_TIME}')
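`convert_date_string` is defined elsewhere in this module; since `updated_on` mirrors the stored HTTP Last-Modified value, a plausible sketch (an assumption, not the actual implementation) is:

from datetime import datetime

def convert_date_string(date_str):
    # Assumed input format: an HTTP Last-Modified value,
    # e.g. 'Wed, 21 Oct 2015 07:28:00 GMT'
    return datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S %Z')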
def main(): if "--config" in sys.argv: index = sys.argv.index("--config") configfile = sys.argv[index + 1] del sys.argv[index:index + 2] else: import os configfile = os.path.abspath( os.path.join( os.path.dirname(__file__), os.pardir, os.pardir, 'openlibrary', 'conf', 'openlibrary.yml', )) load_config(configfile) from infogami import config cmd = sys.argv[1] args, flags = [], { 'servername': config.get('servername', 'https://openlibrary.org') } for i in sys.argv[2:]: if i.startswith('--'): flags[i[2:]] = True else: args.append(i) if cmd == "import-retro": start, stop = ((int(a) for a in args) if (args and len(args) == 2) else (None, None)) return retroactive_import(start=start, stop=stop, servername=flags['servername']) if cmd == "import-ocaids": return import_ocaids(*args, **flags) if cmd == "add-items": return add_items(*args) elif cmd == "add-new-scans": return add_new_scans(args) elif cmd == "import-batch": return import_batch(args, **flags) elif cmd == "import-all": return import_all(args, **flags) elif cmd == "import-item": return import_item(args, **flags) else: logger.error("Unknown command: %s", cmd)
def main():
    global args
    FORMAT = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    logger.info("BEGIN new-solr-updater")

    args = parse_arguments()
    process_args(args)

    # set OL URL when running on a dev-instance
    if args.ol_url:
        host = web.lstrips(args.ol_url, "http://").strip("/")
        update_work.set_query_host(host)

    logger.info(str(args))
    logger.info("loading config from %s", args.config)
    load_config(args.config)

    state_file = args.state_file
    offset = read_state_file(state_file)

    logfile = InfobaseLog(
        config.get('infobase_server'), exclude=args.exclude_edits_containing
    )
    logfile.seek(offset)

    solr = Solr()

    while True:
        records = logfile.read_records()
        keys = parse_log(records)
        count = update_keys(keys)

        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if COMMIT:
            solr.commit(ndocs=count)
        else:
            logger.info("not doing solr commit as commit is off")

        # Don't sleep after committing some records; while the commit was
        # running, more edits might have happened.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            time.sleep(5)
def main():
    global args
    FORMAT = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    logger.info("BEGIN new-solr-updater")

    args = parse_arguments()
    process_args(args)

    # set OL URL when running on a dev-instance
    if args.ol_url:
        host = web.lstrips(args.ol_url, "http://").strip("/")
        update_work.set_query_host(host)

    # Use the logger rather than a bare print for consistency with the rest
    # of the function's output.
    logger.info(str(args))
    logger.info("loading config from %s", args.config)
    load_config(args.config)

    state_file = args.state_file
    offset = read_state_file(state_file)

    logfile = InfobaseLog(config.get('infobase_server'))
    logfile.seek(offset)

    solr = Solr()

    while True:
        records = logfile.read_records()
        keys = parse_log(records)
        count = update_keys(keys)

        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if COMMIT:
            solr.commit(ndocs=count)
        else:
            logger.info("not doing solr commit as commit is off")

        # Don't sleep after committing some records; while the commit was
        # running, more edits might have happened.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            time.sleep(5)
def main(config, start, end):
    """
    Get the unique visitors per day between the two dates (inclusive)
    and store them in the infogami database. Ignores errors.

    :param datetime start:
    :param datetime end:
    :return:
    """
    load_config(config)  # loads config for psql db under the hood
    infogami._setup()

    current = start
    while current <= end:
        try:
            count = count_unique_ips_for_day(current)
            store_data(dict(visitors=count), current)
        except IndexError as e:
            # BaseException.message was removed in Python 3; print the
            # exception itself instead.
            print(e)
        current += timedelta(days=1)
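A sketch of how this might be driven; the config path and date range below are illustrative:

import datetime

# Hypothetical invocation: store daily unique-visitor counts for January 2020
main(
    '/olsystem/etc/openlibrary.yml',
    start=datetime.datetime(2020, 1, 1),
    end=datetime.datetime(2020, 1, 31),
)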
def main(): if "--config" in sys.argv: index = sys.argv.index("--config") configfile = sys.argv[index+1] del sys.argv[index:index+2] else: configfile = "openlibrary.yml" load_config(configfile) cmd = sys.argv[1] args = sys.argv[2:] if cmd == "add-items": return add_items(args) elif cmd == "add-new-scans": return add_new_scans(args) elif cmd == "import-batch": return import_batch(args) elif cmd == "import-all": return import_all(args)
async def main(
    ol_config: str,
    debugger=False,
    state_file='solr-update.state',
    exclude_edits_containing: str = None,
    ol_url='http://openlibrary.org/',
    solr_url: str = None,
    solr_next=False,
    socket_timeout=10,
    load_ia_scans=False,
    commit=True,
    initial_state: str = None,
):
    """
    :param debugger: Wait for a debugger to attach before beginning
    :param exclude_edits_containing: Don't index matching edits
    :param solr_url: If set, overrides the Solr URL from the config file
    :param solr_next: Whether to assume the new schema/etc. are used
    :param initial_state: State to use if the state file doesn't exist. Defaults to today.
    """
    FORMAT = "%(asctime)-15s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=FORMAT)
    logger.info("BEGIN new-solr-updater")

    if debugger:
        import debugpy

        logger.info("Enabling debugger attachment (attach if it hangs here)")
        debugpy.listen(address=('0.0.0.0', 3000))
        logger.info("Waiting for debugger to attach...")
        debugpy.wait_for_client()
        logger.info("Debugger attached to port 3000")

    # Sometimes archive.org requests block forever. Setting a timeout makes
    # the request fail instead of waiting indefinitely.
    socket.setdefaulttimeout(socket_timeout)

    # set OL URL when running on a dev-instance
    if ol_url:
        host = web.lstrips(ol_url, "http://").strip("/")
        update_work.set_query_host(host)

    if solr_url:
        update_work.set_solr_base_url(solr_url)

    update_work.set_solr_next(solr_next)

    logger.info("loading config from %s", ol_config)
    load_config(ol_config)

    offset = read_state_file(state_file, initial_state)

    logfile = InfobaseLog(
        config.get('infobase_server'), exclude=exclude_edits_containing
    )
    logfile.seek(offset)

    solr = Solr()

    while True:
        records = logfile.read_records()
        keys = parse_log(records, load_ia_scans)
        count = await update_keys(keys)

        if logfile.tell() != offset:
            offset = logfile.tell()
            logger.info("saving offset %s", offset)
            with open(state_file, "w") as f:
                f.write(offset)

        if commit:
            solr.commit(ndocs=count)
        else:
            logger.info("not doing solr commit as commit is off")

        # Don't sleep after committing some records; while the commit was
        # running, more edits might have happened.
        if count == 0:
            logger.debug("No more log records available, sleeping...")
            time.sleep(5)
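For context, a plausible sketch of `read_state_file` (the real implementation lives elsewhere in the codebase; this version is an assumption based on how the offset is used above and the docstring's "Defaults to today"):

import os
from datetime import date

def read_state_file(path, initial_state=None):
    # Assumed behavior: return the saved offset if the state file exists,
    # otherwise fall back to initial_state or today's date.
    if os.path.exists(path):
        with open(path) as f:
            return f.read().strip()
    return initial_state or date.today().isoformat()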
def load_config(path):
    logger.info("loading config from %s", path)
    config.load(path)
    config.load_config(path)
    return config.runtime_config
func = {
    "cdump": generate_cdump,
    "dump": generate_dump,
    "idump": generate_idump,
    "sort": sort_dump,
    "split": split_dump,
    "index": make_index,
    "sitemaps": generate_sitemaps,
    "htmlindex": generate_html_index,
}.get(cmd)

if func:
    func(*args, **kwargs)
elif cmd == "solrdump":
    from openlibrary.data import solr  # noqa: E402 avoid circular import

    solr.generate_dump(*args, **kwargs)
else:
    logger.error(f"Unknown command: {cmd}")
    print("Unknown command:", cmd, file=sys.stderr)


if __name__ == "__main__":
    ol_config = os.getenv("OL_CONFIG")
    if ol_config:
        logger.info(f"loading config from {ol_config}")
        load_config(ol_config)
        sentry = Sentry(getattr(config, 'sentry_cron_jobs', {}))
        if sentry.enabled:
            sentry.init()
    main(sys.argv[1], sys.argv[2:])
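Each command maps straight onto the dispatch table above. Assuming this is the tail of a dump-generation script (the script path below is a guess), a run would look like:

OL_CONFIG=/olsystem/etc/openlibrary.yml python scripts/oldump.py cdump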