Пример #1
0
import sys
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_by_collection

def fix_lapl_isShownBy(doc):
    '''Repair a broken isShownBy url by replacing 'sola1' with '00000'.

    Returns the modified doc when its isShownBy value contained 'sola1',
    otherwise None. A missing or null isShownBy is treated as "nothing to
    fix".
    NOTE(review): None presumably tells the runner not to save the doc --
    confirm against run_on_couchdb_by_collection.
    '''
    # `or ''` also covers a key that is present but null; a plain
    # .get(key, '') would hand None to the `in` test and raise TypeError.
    url = doc.get('isShownBy') or ''
    if 'sola1' not in url:
        return None
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Fixing {}'.format(doc['_id']))
    doc['isShownBy'] = url.replace('sola1', '00000')
    return doc

# Module-level entry point: hand fix_lapl_isShownBy to
# run_on_couchdb_by_collection with collection_key "26094".
run_on_couchdb_by_collection(fix_lapl_isShownBy,
        collection_key="26094")
Пример #2
0
            if 'src' in thumb:
                x = thumb['X']
                best_image = thumb['src']
        ref_images = doc['originalRecord'].get('reference-image', [])
        if type(ref_images) == dict:
            ref_images = [ref_images]
        for obj in ref_images:
            if int(obj['X']) > x:
                x = int(obj['X'])
                best_image = obj['src']
        if best_image and not best_image.startswith('http'):
            best_image = '/'.join((URL_OAC_CONTENT_BASE, best_image))
    return best_image


# Prefix joined with a doc's ARK and '/thumbnail' to build the thumbnail
# src in fix_isShownBy_11747.
url_content_base = 'http://content.cdlib.org/'


def fix_isShownBy_11747(doc):
    '''Rebuild the thumbnail src from the doc's ARK and reset isShownBy.

    The ARK is 'ark:' plus the text that follows the first 'ark:' in
    doc['isShownAt'] (up to any later 'ark:'). The thumbnail src under
    doc['originalRecord'] is overwritten with
    url_content_base + ARK + '/thumbnail', and doc['isShownBy'] is set to
    whatever get_best_oac_image returns for the updated doc.

    Returns the modified doc. KeyError / IndexError propagate when the
    expected fields or the 'ark:' marker are absent.
    '''
    ark_tail = doc['isShownAt'].split('ark:')[1]
    doc_ark = 'ark:' + ark_tail
    doc['originalRecord']['thumbnail']['src'] = ''.join(
        (url_content_base, doc_ark, '/thumbnail'))
    # Pick the image only after the thumbnail src has been rewritten.
    doc['isShownBy'] = get_best_oac_image(doc)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("DOC: {} shownBy:{}".format(doc['_id'], doc['isShownBy']))
    return doc


# Module-level entry point: hand fix_isShownBy_11747 to
# run_on_couchdb_by_collection with collection_key "11747".
run_on_couchdb_by_collection(fix_isShownBy_11747, collection_key="11747")
        ),
        "/query"
)

def fill_object_values_from_solr(doc):
    '''If no object field, try to get it from current solr.

    Docs that already have an 'object' key are left untouched. Otherwise
    solr is queried for harvest_id_s equal to doc['_id']; when the first
    result carries reference_image_md5, doc['object'] is set to it and
    doc['object_dimensions'] to reference_image_dimensions split on ':'.

    Returns the updated doc, or None when nothing was changed.
    '''
    if 'object' in doc:
        return None
    query = 'harvest_id_s:"{}"'.format(doc['_id'])
    resp = SOLR(
        q=query,
        fields='harvest_id_s, reference_image_md5, id, collection_url, reference_image_dimensions',
    )
    results = resp.results
    if not results:
        msg = "NO OBJECT FOR {}".format(doc['_id'])
        # Parenthesised single-argument print behaves the same on Python 2
        # and 3.
        print('NOT IN SOLR -- {}'.format(msg))
        return None
    solr_doc = results[0]
    if 'reference_image_md5' not in solr_doc:
        return None
    doc['object'] = solr_doc['reference_image_md5']
    # NOTE(review): raises KeyError if the solr doc has an md5 but no
    # reference_image_dimensions -- confirm that cannot happen.
    doc['object_dimensions'] = solr_doc['reference_image_dimensions'].split(':')
    print("OBJ DIM:{}".format(doc['object_dimensions']))
    print('UPDATING OBJECT -- {}'.format(doc['_id']))
    return doc

# Module-level entry point: hand fill_object_values_from_solr to
# run_on_couchdb_by_collection with collection_key "26094". The
# commented-out line below is an alternate key that is currently disabled.
run_on_couchdb_by_collection(fill_object_values_from_solr,
        #collection_key="23066")
        collection_key="26094")

Пример #4
0
import sys
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_by_collection


def fix_lapl_isShownBy(doc):
    '''Repair a broken isShownBy url by replacing 'sola1' with '00000'.

    Returns the modified doc when its isShownBy value contained 'sola1',
    otherwise None. A missing or null isShownBy is treated as "nothing to
    fix".
    NOTE(review): None presumably tells the runner not to save the doc --
    confirm against run_on_couchdb_by_collection.
    '''
    # `or ''` also covers a key that is present but null; a plain
    # .get(key, '') would hand None to the `in` test and raise TypeError.
    url = doc.get('isShownBy') or ''
    if 'sola1' not in url:
        return None
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print('Fixing {}'.format(doc['_id']))
    doc['isShownBy'] = url.replace('sola1', '00000')
    return doc


# Module-level entry point: hand fix_lapl_isShownBy to
# run_on_couchdb_by_collection with collection_key "26094".
run_on_couchdb_by_collection(fix_lapl_isShownBy, collection_key="26094")
Пример #5
0
        if thumb:
            if "src" in thumb:
                x = thumb["X"]
                best_image = thumb["src"]
        ref_images = doc["originalRecord"].get("reference-image", [])
        if type(ref_images) == dict:
            ref_images = [ref_images]
        for obj in ref_images:
            if int(obj["X"]) > x:
                x = int(obj["X"])
                best_image = obj["src"]
        if best_image and not best_image.startswith("http"):
            best_image = "/".join((URL_OAC_CONTENT_BASE, best_image))
    return best_image


# Prefix joined with a doc's ARK and "/thumbnail" to build the thumbnail
# src in fix_isShownBy_11747.
url_content_base = "http://content.cdlib.org/"


def fix_isShownBy_11747(doc):
    """Rebuild the thumbnail src from the doc's ARK and reset isShownBy.

    The ARK is "ark:" plus the text that follows the first "ark:" in
    doc["isShownAt"] (up to any later "ark:"). The thumbnail src under
    doc["originalRecord"] is overwritten with
    url_content_base + ARK + "/thumbnail", and doc["isShownBy"] is set to
    whatever get_best_oac_image returns for the updated doc.

    Returns the modified doc. KeyError / IndexError propagate when the
    expected fields or the "ark:" marker are absent.
    """
    ark_tail = doc["isShownAt"].split("ark:")[1]
    doc_ark = "ark:" + ark_tail
    doc["originalRecord"]["thumbnail"]["src"] = "".join(
        (url_content_base, doc_ark, "/thumbnail"))
    # Pick the image only after the thumbnail src has been rewritten.
    doc["isShownBy"] = get_best_oac_image(doc)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("DOC: {} shownBy:{}".format(doc["_id"], doc["isShownBy"]))
    return doc


# Module-level entry point: hand fix_isShownBy_11747 to
# run_on_couchdb_by_collection with collection_key "11747".
run_on_couchdb_by_collection(fix_isShownBy_11747, collection_key="11747")
###"24613",
###"24760",
###"2487",
###"24981",
###"25043",
###"25149",
###"2515",
###"25152",
###"25158",
###"25236",
###"25252",
###"25267",
###"25321",
###"2545",
###"25471",
###"25496",
###"25500",
###"25503",
###"25507",
###"25529",
###"25597",
###"2600",
###"26094",

# Run add_rights_and_type_to_collection once per id in collection_ids.
for c in collection_ids:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("RUN FOR {}".format(c))
    # Flush both streams before starting the run for this collection
    # (presumably so the progress line appears ahead of the run's output).
    sys.stderr.flush()
    sys.stdout.flush()
    run_on_couchdb_by_collection(add_rights_and_type_to_collection,
                                 collection_key=c)
Пример #7
0
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_by_collection

# pass in a Couchdb doc, get back one with de-duplicated sourceResource values
def dedupe_sourceresource(doc):
    '''Drop repeated entries from every list under doc['sourceResource'].

    Two entries count as duplicates only when they compare equal; the first
    occurrence is kept and the original order is preserved. Non-list values
    are left as they are. The doc is modified in place and returned.
    '''
    source_resource = doc['sourceResource']
    for field, values in source_resource.items():
        if not isinstance(values, list):
            continue
        # Entries may be dicts (unhashable), so membership is checked against
        # a list rather than a set.
        unique = []
        for candidate in values:
            if candidate in unique:
                continue
            unique.append(candidate)
        source_resource[field] = unique
    return doc

if __name__=='__main__':
    # The transform in this module is dedupe_sourceresource; passing the
    # misspelt name dedup_sourceresource raised NameError before any doc
    # was processed.
    # NOTE(review): called without a collection_key -- confirm the runner
    # accepts that.
    doc_ids = run_on_couchdb_by_collection(dedupe_sourceresource)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("NUMBER OF DOCS DEDUPED:{}".format(len(doc_ids)))
###"24613",
###"24760",
###"2487",
###"24981",
###"25043",
###"25149",
###"2515",
###"25152",
###"25158",
###"25236",
###"25252",
###"25267",
###"25321",
###"2545",
###"25471",
###"25496",
###"25500",
###"25503",
###"25507",
###"25529",
###"25597",
###"2600",
###"26094",

# Run add_rights_and_type_to_collection once per id in collection_ids.
for c in collection_ids:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("RUN FOR {}".format(c))
    # Flush both streams before starting the run for this collection
    # (presumably so the progress line appears ahead of the run's output).
    sys.stderr.flush()
    sys.stdout.flush()
    run_on_couchdb_by_collection(add_rights_and_type_to_collection,
                                 collection_key=c)
Пример #9
0
###"10664",
###"10671",
###"10710",
###"10722",
###"10732",
###"108",
###"10885",
###"10935",
###"11",
###"11010",
###"11068",
###"11073",
###"11075",
###"11076",
###"11084",
###"11134",
###"11167",
###"11439",
###"11575",
###"11582",
###"1161",
###"11705",
###]

# Run fix_rights_status once per id in collection_ids.
for c in collection_ids:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("RUN FOR {}".format(c))
    # Flush both streams before starting the run for this collection
    # (presumably so the progress line appears ahead of the run's output).
    sys.stderr.flush()
    sys.stdout.flush()
    run_on_couchdb_by_collection(fix_rights_status,
                                 collection_key=c)
def fill_object_values_from_solr(doc):
    '''If no object field, try to get it from current solr.

    Docs that already have an 'object' key are left untouched. Otherwise
    solr is queried for harvest_id_s equal to doc['_id']; when the first
    result carries reference_image_md5, doc['object'] is set to it and
    doc['object_dimensions'] to reference_image_dimensions split on ':'.

    Returns the updated doc, or None when nothing was changed.
    '''
    if 'object' in doc:
        return None
    query = 'harvest_id_s:"{}"'.format(doc['_id'])
    resp = SOLR(
        q=query,
        fields=
        'harvest_id_s, reference_image_md5, id, collection_url, reference_image_dimensions',
    )
    results = resp.results
    if not results:
        msg = "NO OBJECT FOR {}".format(doc['_id'])
        # Parenthesised single-argument print behaves the same on Python 2
        # and 3.
        print('NOT IN SOLR -- {}'.format(msg))
        return None
    solr_doc = results[0]
    if 'reference_image_md5' not in solr_doc:
        return None
    doc['object'] = solr_doc['reference_image_md5']
    # NOTE(review): raises KeyError if the solr doc has an md5 but no
    # reference_image_dimensions -- confirm that cannot happen.
    doc['object_dimensions'] = solr_doc[
        'reference_image_dimensions'].split(':')
    print("OBJ DIM:{}".format(doc['object_dimensions']))
    print('UPDATING OBJECT -- {}'.format(doc['_id']))
    return doc


# Module-level entry point: hand fill_object_values_from_solr to
# run_on_couchdb_by_collection with collection_key "26094". The
# commented-out line below is an alternate key that is currently disabled.
run_on_couchdb_by_collection(
    fill_object_values_from_solr,
    #collection_key="23066")
    collection_key="26094")
Пример #11
0
###"10632",
###"10664",
###"10671",
###"10710",
###"10722",
###"10732",
###"108",
###"10885",
###"10935",
###"11",
###"11010",
###"11068",
###"11073",
###"11075",
###"11076",
###"11084",
###"11134",
###"11167",
###"11439",
###"11575",
###"11582",
###"1161",
###"11705",
###]

# Run fix_rights_status once per id in collection_ids.
for c in collection_ids:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("RUN FOR {}".format(c))
    # Flush both streams before starting the run for this collection
    # (presumably so the progress line appears ahead of the run's output).
    sys.stderr.flush()
    sys.stdout.flush()
    run_on_couchdb_by_collection(fix_rights_status, collection_key=c)