def import_ocaids(*ocaids, **kwargs):
    """This method is mostly for testing. It allows you to import one more
    archive.org items into Open Library by ocaid.

    Usage:
        $ sudo -u openlibrary /olsystem/bin/olenv \
            HOME=/home/openlibrary OPENLIBRARY_RCFILE=/olsystem/etc/olrc-importbot \
            python scripts/manage-imports.py \
                --config /olsystem/etc/openlibrary.yml \
                import-all

    :param ocaids: one or more archive.org identifiers to import
    :param kwargs: `servername` (import target) and `no_marc` (skip MARC requirement)
    :raises ValueError: if no ocaid is given
    """
    servername = kwargs.get('servername', None)
    require_marc = not kwargs.get('no_marc', False)
    date = datetime.date.today()
    if not ocaids:
        raise ValueError("Must provide at least one ocaid")
    batch_name = "import-%s-%04d%02d" % (ocaids[0], date.year, date.month)
    # Find-or-create keeps `batch` always bound; the original wrapped only
    # Batch.new in try/except, leaving `batch` undefined (NameError below)
    # when creation failed because the batch already existed.
    batch = Batch.find(batch_name) or Batch.new(batch_name)
    try:
        batch.add_items(ocaids)
    except Exception:
        # Best-effort: items may already be queued in this batch.
        logger.info("skipping batch adding, already present")
    for ocaid in ocaids:
        item = ImportItem.find_by_identifier(ocaid)
        if item:
            do_import(item, servername=servername, require_marc=require_marc)
        else:
            # BUG FIX: original referenced undefined name `ia_id` here,
            # raising NameError instead of logging the missing identifier.
            logger.error("%s is not found in the import queue", ocaid)
def main(ol_config: str):
    """Load the given Open Library config and import the partner (BWB) data
    file named on the command line into this month's batch."""
    load_config(ol_config)
    # Partner data is offset ~15 days from start of month
    effective_date = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', effective_date.year, effective_date.month)
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch_import(sys.argv[1], batch)
def main():
    """Load the default olsystem Open Library config and import the partner
    (BWB) data file named on the command line into this month's batch."""
    config_path = os.path.abspath(
        os.path.join(os.sep, 'olsystem', 'etc', 'openlibrary.yml')
    )
    load_config(config_path)
    # Partner data is offset ~15 days from start of month
    reference_date = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', reference_date.year, reference_date.month)
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch_import(sys.argv[1], batch)
def create_batch(records: list[dict[str, str]]) -> None:
    """Creates Standard Ebook batch import job.

    Attempts to find existing Standard Ebooks import batch.
    If nothing is found, a new batch is created. All of the
    given import records are added to the batch job as JSON strings.

    :param records: import records; each must have `source_records` with the
        archive.org-style identifier first.
    """
    now = time.gmtime(time.time())
    # FIX: zero-pad the month so e.g. January yields "standardebooks-202301"
    # rather than the ambiguous "standardebooks-20231" (which Oct/Nov/Dec of
    # an earlier scheme could collide with). This matches the "%04d%02d"
    # batch-name convention used elsewhere in this codebase.
    batch_name = f'standardebooks-{now.tm_year}{now.tm_mon:02d}'
    batch = Batch.find(batch_name) or Batch.new(batch_name)
    batch.add_items(
        [{'ia_id': r['source_records'][0], 'data': json.dumps(r)} for r in records]
    )
def add_new_scans(args):
    """Adds new scans from yesterday.

    If args[0] is given it must be a YYYY-MM-DD date string to use instead
    of yesterday.
    """
    if args:
        year, month, day = args[0].split("-")
        date = datetime.date(int(year), int(month), int(day))
    else:
        # default: yesterday
        date = datetime.date.today() - datetime.timedelta(days=1)
    candidates = get_candidate_ocaids(since_date=date)
    batch_name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.add_items(candidates)
def retroactive_import(start=None, stop=None, servername=None):
    """Retroactively searches and imports all previously missed books
    (through all time) in the Archive.org database which were created
    after scribe3 was released (when we switched repub states from 4 to
    [19, 20, 22]).

    :param start: optional slice start into the candidate list
    :param stop: optional slice stop into the candidate list
    :param servername: optional import target server
    """
    repub_states_scribe3 = [19, 20, 22]
    candidates = get_candidate_ocaids(
        scanned_within_days=None, repub_states=repub_states_scribe3
    )
    items = candidates[start:stop]
    today = datetime.date.today()
    batch_name = "new-scans-%04d%02d" % (today.year, today.month)
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.add_items(items)
    for item in batch.get_items():
        do_import(item, servername=servername)
def add_new_scans(args):
    """Adds new scans from yesterday.

    If args[0] is given it must be a YYYY-MM-DD date string to use instead
    of yesterday.
    """
    if args:
        year, month, day = args[0].split("-")
        date = datetime.date(int(year), int(month), int(day))
    else:
        # default: yesterday
        date = datetime.date.today() - datetime.timedelta(days=1)
    excluded_collection_a = '%opensource%'
    excluded_collection_b = '%additional_collections%'
    # Find all scans which are updated/added on the given date
    # and have been scanned at most 2 months ago
    q = (
        "SELECT identifier FROM metadata"
        " WHERE repub_state=4"
        " AND mediatype='texts'"
        " AND scancenter IS NOT NULL"
        " AND collection NOT LIKE $c1"
        " AND collection NOT LIKE $c2"
        " AND (curatestate IS NULL OR curatestate != 'dark')"
        " AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'"
        " AND scandate is NOT NULL AND scandate > $min_scandate"
        " AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)"
    )
    min_scandate = date - datetime.timedelta(60)  # 2 months ago
    bindings = dict(
        c1=excluded_collection_a,
        c2=excluded_collection_b,
        date=date.isoformat(),
        min_scandate=min_scandate.strftime("%Y%m%d"),
    )
    result = get_ia_db().query(q, vars=bindings)
    identifiers = [row.identifier for row in result]
    batch_name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.add_items(identifiers)
def add_new_scans(args):
    """Adds new scans from yesterday.

    If args[0] is given it must be a YYYY-MM-DD date string to use instead
    of yesterday.
    """
    if args:
        datestr = args[0]
        parts = datestr.split("-")
        date = datetime.date(int(parts[0]), int(parts[1]), int(parts[2]))
    else:
        date = datetime.date.today() - datetime.timedelta(days=1)  # yesterday
    # Find all scans which are updated/added on the given date
    # and have been scanned at most 2 months ago
    query = (
        "SELECT identifier FROM metadata"
        " WHERE repub_state=4"
        " AND mediatype='texts'"
        " AND scancenter IS NOT NULL"
        " AND collection NOT LIKE $c1"
        " AND collection NOT LIKE $c2"
        " AND (curatestate IS NULL OR curatestate != 'dark')"
        " AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'"
        " AND scandate is NOT NULL AND scandate > $min_scandate"
        " AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)"
    )
    two_months_ago = date - datetime.timedelta(60)  # 2 months ago
    bindings = {
        'c1': '%opensource%',
        'c2': '%additional_collections%',
        'date': date.isoformat(),
        'min_scandate': two_months_ago.strftime("%Y%m%d"),
    }
    result = get_ia_db().query(query, vars=bindings)
    identifiers = [row.identifier for row in result]
    batch_name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.add_items(identifiers)
def add_items(args):
    """Load import items from a file into a batch, creating the batch if needed.

    :param args: [batch_name, filename]
    """
    batch_name, filename = args[0], args[1]
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.load_items(filename)
def add_items(batch_name, filename):
    """Load import items from `filename` into batch `batch_name`,
    creating the batch if it does not already exist."""
    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)
    batch.load_items(filename)