def main():
    """Submit arXiv records that have no fulltext for processing, in batches.

    Iterates over the recids returned by
    ``search_pattern(p='arxiv', f='reportnumber')`` and collects every recid
    for which ``record_has_fulltext`` is false.  Each time 1000 recids have
    been collected they are passed to ``submit_task`` and the returned task
    id is waited on with ``wait_for_task`` before scanning continues.  Any
    remainder is submitted at the end; that final task is not waited for.

    Progress and submitted task ids are printed to stdout.
    """
    batch_size = 1000
    recids = search_pattern(p='arxiv', f='reportnumber')
    total = len(recids)
    to_process = []

    for count, recid in enumerate(recids):
        # Single-argument print(...) produces the same output on
        # Python 2 and Python 3.
        if count % 50 == 0:
            print('done %s of %s' % (count, total))

        if record_has_fulltext(recid):
            continue

        print('adding %s' % (recid,))
        to_process.append(recid)

        # The batch can only reach its limit right after an append.
        if len(to_process) == batch_size:
            task_id = submit_task(to_process)
            print('submitted task id %s' % task_id)
            wait_for_task(task_id)
            to_process = []

    if to_process:
        task_id = submit_task(to_process)
        print('submitted final task id %s' % task_id)
Пример #2
0
def process_one(recid):
    """Check one record for an updated pdf on arxiv and download it.

    Returns True once ``download_one`` has completed without raising and
    STATUS_OK has been stored for the arxiv version.

    Raises:
        PdfNotAvailable: no arxiv version could be fetched; or the stored
            status for this same version is STATUS_MISSING; or
            ``download_one`` raised it (STATUS_MISSING is stored first).
        AlreadyHarvested: the record has fulltext and the stored version
            equals the arxiv version.
        FoundExistingPdf: re-raised from ``download_one`` after STATUS_OK
            has been stored.
    """
    write_message('checking %s' % recid)

    # Status and version stored by the previous harvest
    last_status, last_version = fetch_arxiv_pdf_status(recid)

    # Version currently reported by arxiv
    current_version = fetch_arxiv_version(recid)
    if not current_version:
        reason = 'version information unavailable'
        write_message(reason)
        raise PdfNotAvailable(reason)

    write_message('harvested_version %s' % last_version)
    write_message('arxiv_version %s' % current_version)

    same_version = last_version == current_version

    if record_has_fulltext(recid) and same_version:
        write_message('our version matches arxiv')
        raise AlreadyHarvested(status=last_status)

    # An earlier attempt at this very version was already recorded as missing
    if same_version and last_status == STATUS_MISSING:
        raise PdfNotAvailable()

    try:
        download_one(recid, current_version)
    except PdfNotAvailable:
        store_arxiv_pdf_status(recid, STATUS_MISSING, current_version)
        raise
    except FoundExistingPdf:
        store_arxiv_pdf_status(recid, STATUS_OK, current_version)
        raise

    # Reaching this point means the download raised nothing.
    store_arxiv_pdf_status(recid, STATUS_OK, current_version)
    return True
Пример #3
0
def process_one(recid):
    """Compare the harvested pdf version of recid with arxiv and refresh it.

    The stored harvest status/version and the current arxiv version are
    fetched and logged through ``write_message``.  When the versions are
    equal the function raises instead of downloading: AlreadyHarvested if
    the record has fulltext, PdfNotAvailable if the stored status is
    STATUS_MISSING.  Otherwise ``download_one`` is called and the outcome
    is recorded with ``store_arxiv_pdf_status``.

    Returns True after a download that raised nothing.  PdfNotAvailable is
    also raised when no arxiv version is available; PdfNotAvailable and
    FoundExistingPdf coming from ``download_one`` are re-raised after the
    matching status (STATUS_MISSING / STATUS_OK) has been stored.
    """
    write_message('checking %s' % recid)

    # What the last harvest recorded for this record
    stored_status, stored_version = fetch_arxiv_pdf_status(recid)

    # What arxiv reports now
    remote_version = fetch_arxiv_version(recid)
    if not remote_version:
        text = 'version information unavailable'
        write_message(text)
        raise PdfNotAvailable(text)

    write_message('harvested_version %s' % stored_version)
    write_message('arxiv_version %s' % remote_version)

    # Looked up unconditionally, before the version comparison.
    has_fulltext = record_has_fulltext(recid)

    if stored_version == remote_version:
        if has_fulltext:
            write_message('our version matches arxiv')
            raise AlreadyHarvested(status=stored_status)

        # Same version was tried before and recorded as missing
        if stored_status == STATUS_MISSING:
            raise PdfNotAvailable()

    try:
        download_one(recid, remote_version)
    except PdfNotAvailable:
        store_arxiv_pdf_status(recid, STATUS_MISSING, remote_version)
        raise
    except FoundExistingPdf:
        store_arxiv_pdf_status(recid, STATUS_OK, remote_version)
        raise
    else:
        store_arxiv_pdf_status(recid, STATUS_OK, remote_version)
        return True
 def cb_process_one(recid):
     """Pass recid to ``refextract.add`` when it has fulltext.

     Nothing is done for records that already carry a '999C5' or '999C6'
     field (presumably existing reference fields -- confirm against the
     record schema).
     """
     rec = get_record(recid)
     # Checked in this order; stop at the first tag that is present.
     for tag in ('999C5', '999C6'):
         if rec.find_fields(tag):
             return
     if record_has_fulltext(recid):
         refextract.add(recid)