예제 #1
0
def import_ocaids(*ocaids, **kwargs):
    r"""Import one or more archive.org items into Open Library by ocaid.

    This method is mostly for testing.

    Usage:
        $ sudo -u openlibrary /olsystem/bin/olenv \
            HOME=/home/openlibrary OPENLIBRARY_RCFILE=/olsystem/etc/olrc-importbot \
            python scripts/manage-imports.py \
                --config /olsystem/etc/openlibrary.yml \
                import-all

    NOTE(review): the usage above names the ``import-all`` subcommand;
    confirm which subcommand actually dispatches to this function.

    Args:
        *ocaids: archive.org identifiers to import; at least one is required.
        **kwargs: optional settings:
            servername: forwarded to ``do_import`` (default ``None``).
            no_marc: when truthy, ``do_import`` is called with
                ``require_marc=False`` (default ``False``).

    Raises:
        ValueError: if no ocaid is given.
    """
    # Validate first so nothing else runs for an empty call.
    if not ocaids:
        raise ValueError("Must provide at least one ocaid")

    servername = kwargs.get('servername', None)
    require_marc = not kwargs.get('no_marc', False)

    date = datetime.date.today()
    batch_name = "import-%s-%04d%02d" % (ocaids[0], date.year, date.month)

    # Creating the batch is best effort: a failure is logged and the items
    # are still looked up in the import queue below.
    batch = None
    try:
        batch = Batch.new(batch_name)
    except Exception as e:
        logger.info(str(e))

    if batch is None:
        # No batch object to add to; keep the log line the original emitted
        # in this situation.
        logger.info("skipping batch adding, already present")
    else:
        try:
            batch.add_items(ocaids)
        except Exception:
            logger.info("skipping batch adding, already present")

    for ocaid in ocaids:
        item = ImportItem.find_by_identifier(ocaid)
        if item:
            do_import(item, servername=servername, require_marc=require_marc)
        else:
            # The original referenced an undefined name here (``ia_id``),
            # which raised NameError instead of logging the missing ocaid.
            logger.error("%s is not found in the import queue", ocaid)
예제 #2
0
def main(ol_config: str):
    """Load the given config and run a batch import into the monthly bwb batch.

    The batch is named ``bwb-YYYYMM`` from the date 15 days before today.
    A batch with that name is reused when ``Batch.find`` returns one,
    otherwise a new one is created. ``sys.argv[1]`` is forwarded to
    ``batch_import`` together with the batch.
    """
    load_config(ol_config)

    # Partner data is offset ~15 days from start of month, so the batch
    # name is derived from a date shifted back by that amount.
    shifted = datetime.date.today() - timedelta(days=15)
    name = "bwb-%04d%02d" % (shifted.year, shifted.month)

    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch_import(sys.argv[1], batch)
def main():
    """Load the olsystem config and run a batch import into the bwb batch.

    The config is read from ``olsystem/etc/openlibrary.yml`` under the
    filesystem root. The batch is named ``bwb-YYYYMM`` from the date 15 days
    before today and is created when ``Batch.find`` does not return one.
    ``sys.argv[1]`` is forwarded to ``batch_import`` together with the batch.
    """
    config_path = os.path.join(os.sep, 'olsystem', 'etc', 'openlibrary.yml')
    load_config(os.path.abspath(config_path))

    # Partner data is offset ~15 days from start of month
    shifted = datetime.date.today() - timedelta(days=15)
    name = "bwb-%04d%02d" % (shifted.year, shifted.month)

    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch_import(sys.argv[1], batch)
예제 #4
0
def create_batch(records: list[dict[str, str]]) -> None:
    """Add the given records to this month's Standard Ebooks import batch.

    The batch name is ``standardebooks-`` followed by the current UTC year
    and month (the month is not zero-padded). An existing batch with that
    name is reused; otherwise a new one is created. Each record becomes one
    item whose ``ia_id`` is the record's first ``source_records`` entry and
    whose ``data`` is the record serialised as a JSON string.
    """
    utc_now = time.gmtime(time.time())
    batch_name = f'standardebooks-{utc_now.tm_year}{utc_now.tm_mon}'

    batch = Batch.find(batch_name)
    if not batch:
        batch = Batch.new(batch_name)

    items = []
    for record in records:
        items.append({
            'ia_id': record['source_records'][0],
            'data': json.dumps(record),
        })
    batch.add_items(items)
예제 #5
0
def add_new_scans(args):
    """Add candidate scans since a date to that month's ``new-scans`` batch.

    When ``args`` is non-empty its first element is parsed as a
    ``YYYY-MM-DD`` date; otherwise yesterday's date is used. The batch is
    named ``new-scans-YYYYMM`` from that date and is created when
    ``Batch.find`` does not return one.
    """
    if not args:
        # No date given: fall back to yesterday.
        date = datetime.date.today() - datetime.timedelta(days=1)
    else:
        year, month, day = args[0].split("-")
        date = datetime.date(*map(int, (year, month, day)))

    candidates = get_candidate_ocaids(since_date=date)

    name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(candidates)
예제 #6
0
def retroactive_import(start=None, stop=None, servername=None):
    """Retroactively search for and import previously missed books.

    Looks (through all time) for Archive.org items created after scribe3
    was released, when the repub states switched from 4 to [19, 20, 22].
    The candidate list is sliced with ``[start:stop]``, added to the current
    month's ``new-scans-YYYYMM`` batch, and then every item returned by the
    batch's ``get_items()`` is passed to ``do_import``.
    """
    candidates = get_candidate_ocaids(
        scanned_within_days=None, repub_states=[19, 20, 22])
    selected = candidates[start:stop]

    today = datetime.date.today()
    name = "new-scans-%04d%02d" % (today.year, today.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(selected)

    for queued in batch.get_items():
        do_import(queued, servername=servername)
예제 #7
0
def add_new_scans(args):
    """Add scans updated on a given date to that month's ``new-scans`` batch.

    When ``args`` is non-empty its first element is parsed as a
    ``YYYY-MM-DD`` date; otherwise yesterday's date is used. Identifiers are
    selected from the ``metadata`` table and added to the batch named
    ``new-scans-YYYYMM``, which is created when ``Batch.find`` does not
    return one.
    """
    if not args:
        # No date given: fall back to yesterday.
        date = datetime.date.today() - datetime.timedelta(days=1)
    else:
        year, month, day = args[0].split("-")
        date = datetime.date(*map(int, (year, month, day)))

    # Find all scans which are updated/added on the given date
    # and have been scanned at most 2 months ago. The two collection
    # patterns are passed as query variables rather than inlined.
    query = (
        "SELECT identifier FROM metadata"
        " WHERE repub_state=4"
        "   AND mediatype='texts'"
        "   AND scancenter IS NOT NULL"
        "   AND collection NOT LIKE $c1"
        "   AND collection NOT LIKE $c2"
        "   AND (curatestate IS NULL OR curatestate != 'dark')"
        "   AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'"
        "   AND scandate is NOT NULL AND scandate > $min_scandate"
        "   AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)"
    )

    earliest_scan = date - datetime.timedelta(days=60)  # 2 months ago
    db = get_ia_db()
    params = {
        "c1": '%opensource%',
        "c2": '%additional_collections%',
        "date": date.isoformat(),
        "min_scandate": earliest_scan.strftime("%Y%m%d"),
    }
    rows = db.query(query, vars=params)

    identifiers = []
    for row in rows:
        identifiers.append(row.identifier)

    name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(identifiers)
예제 #8
0
def add_new_scans(args):
    """Add scans updated on a given date to that month's ``new-scans`` batch.

    ``args[0]``, when present, is a ``YYYY-MM-DD`` date string; with empty
    ``args`` yesterday's date is used. Matching identifiers are read from
    the ``metadata`` table and added to the ``new-scans-YYYYMM`` batch,
    which is reused if ``Batch.find`` returns it and created otherwise.
    """
    if not args:
        # Default: yesterday.
        date = datetime.date.today() - datetime.timedelta(days=1)
    else:
        year, month, day = args[0].split("-")
        date = datetime.date(*map(int, (year, month, day)))

    # Find all scans which are updated/added on the given date
    # and have been scanned at most 2 months ago.
    clauses = [
        "SELECT identifier FROM metadata",
        " WHERE repub_state=4",
        "   AND mediatype='texts'",
        "   AND scancenter IS NOT NULL",
        "   AND collection NOT LIKE $c1",
        "   AND collection NOT LIKE $c2",
        "   AND (curatestate IS NULL OR curatestate != 'dark')",
        "   AND lower(format) LIKE '%%pdf%%' AND lower(format) LIKE '%%marc%%'",
        "   AND scandate is NOT NULL AND scandate > $min_scandate",
        "   AND updated > $date AND updated < ($date::date + INTERVAL '1' DAY)",
    ]
    sql = "".join(clauses)

    cutoff = date - datetime.timedelta(days=60)  # 2 months ago
    db = get_ia_db()
    rows = db.query(
        sql,
        vars={
            "c1": '%opensource%',
            "c2": '%additional_collections%',
            "date": date.isoformat(),
            "min_scandate": cutoff.strftime("%Y%m%d"),
        },
    )
    found = [row.identifier for row in rows]

    name = "new-scans-%04d%02d" % (date.year, date.month)
    batch = Batch.find(name)
    if not batch:
        batch = Batch.new(name)
    batch.add_items(found)
예제 #9
0
def add_items(args):
    """Load items from a file into a named batch, creating it if needed.

    ``args[0]`` is the batch name and ``args[1]`` is the filename passed to
    the batch's ``load_items``. Extra elements in ``args`` are ignored.
    """
    name, path = args[0], args[1]

    target = Batch.find(name)
    if not target:
        target = Batch.new(name)
    target.load_items(path)
예제 #10
0
def add_items(batch_name, filename):
    """Load items from ``filename`` into the batch called ``batch_name``.

    The batch is reused when ``Batch.find`` returns one and created with
    ``Batch.new`` otherwise.
    """
    target = Batch.find(batch_name)
    if not target:
        target = Batch.new(batch_name)
    target.load_items(filename)
예제 #11
0
def add_items(args):
    """Load a file of items into a batch, given ``[batch_name, filename]``.

    The first two elements of ``args`` are the batch name and the filename
    passed to ``load_items``. The batch is looked up with ``Batch.find`` and
    created with ``Batch.new`` when the lookup returns nothing.
    """
    batch_name, filename = args[0], args[1]
    (Batch.find(batch_name) or Batch.new(batch_name)).load_items(filename)