import sys
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_by_collection


def fix_lapl_isShownBy(doc):
    '''Repair an ``isShownBy`` URL that contains the broken token ``sola1``.

    The token is replaced with ``00000`` in place on ``doc``.

    Returns the modified ``doc`` when a replacement was made, otherwise
    ``None`` (presumably telling the runner there is nothing to save --
    confirm against run_on_couchdb_by_collection).
    '''
    shown_by = doc.get('isShownBy', '')
    if 'sola1' not in shown_by:
        return None
    print('Fixing {}'.format(doc['_id']))
    doc['isShownBy'] = shown_by.replace('sola1', '00000')
    return doc


# Apply the fix to every document of collection 26094.
run_on_couchdb_by_collection(fix_lapl_isShownBy, collection_key="26094")
if 'src' in thumb: x = thumb['X'] best_image = thumb['src'] ref_images = doc['originalRecord'].get('reference-image', []) if type(ref_images) == dict: ref_images = [ref_images] for obj in ref_images: if int(obj['X']) > x: x = int(obj['X']) best_image = obj['src'] if best_image and not best_image.startswith('http'): best_image = '/'.join((URL_OAC_CONTENT_BASE, best_image)) return best_image url_content_base = 'http://content.cdlib.org/' def fix_isShownBy_11747(doc): doc_ark = doc['isShownAt'].split('ark:')[1] doc_ark = 'ark:' + doc_ark doc['originalRecord']['thumbnail']['src'] = ''.join( (url_content_base, doc_ark, '/thumbnail')) best_image = get_best_oac_image(doc) doc['isShownBy'] = best_image print "DOC: {} shownBy:{}".format(doc['_id'], doc['isShownBy']) return doc run_on_couchdb_by_collection(fix_isShownBy_11747, collection_key="11747")
), "/query" ) def fill_object_values_from_solr(doc): '''If no object field, try to get from current solr ''' if 'object' not in doc: query='harvest_id_s:"{}"'.format(doc['_id']) msg = "NO OBJECT FOR {}".format(doc['_id']) resp = SOLR(q=query, fields='harvest_id_s, reference_image_md5, id, collection_url, reference_image_dimensions', ) results = resp.results if results: solr_doc = results[0] if 'reference_image_md5' in solr_doc: doc['object'] = solr_doc['reference_image_md5'] doc['object_dimensions'] = solr_doc['reference_image_dimensions'].split(':') print "OBJ DIM:{}".format(doc['object_dimensions']) print 'UPDATING OBJECT -- {}'.format(doc['_id']) return doc else: print 'NOT IN SOLR -- {}'.format(msg) return None run_on_couchdb_by_collection(fill_object_values_from_solr, #collection_key="23066") collection_key="26094")
if thumb: if "src" in thumb: x = thumb["X"] best_image = thumb["src"] ref_images = doc["originalRecord"].get("reference-image", []) if type(ref_images) == dict: ref_images = [ref_images] for obj in ref_images: if int(obj["X"]) > x: x = int(obj["X"]) best_image = obj["src"] if best_image and not best_image.startswith("http"): best_image = "/".join((URL_OAC_CONTENT_BASE, best_image)) return best_image url_content_base = "http://content.cdlib.org/" def fix_isShownBy_11747(doc): doc_ark = doc["isShownAt"].split("ark:")[1] doc_ark = "ark:" + doc_ark doc["originalRecord"]["thumbnail"]["src"] = "".join((url_content_base, doc_ark, "/thumbnail")) best_image = get_best_oac_image(doc) doc["isShownBy"] = best_image print "DOC: {} shownBy:{}".format(doc["_id"], doc["isShownBy"]) return doc run_on_couchdb_by_collection(fix_isShownBy_11747, collection_key="11747")
###"24613",
###"24760",
###"2487",
###"24981",
###"25043",
###"25149",
###"2515",
###"25152",
###"25158",
###"25236",
###"25252",
###"25267",
###"25321",
###"2545",
###"25471",
###"25496",
###"25500",
###"25503",
###"25507",
###"25529",
###"25597",
###"2600",
###"26094",
# NOTE(review): the ### entries above look like collection ids that were
# disabled by the author -- confirm before re-enabling any of them.

# Run add_rights_and_type_to_collection over each id in collection_ids.
for collection_key in collection_ids:
    print("RUN FOR {}".format(collection_key))
    # Flush both streams before handing off to the per-collection run.
    sys.stderr.flush()
    sys.stdout.flush()
    run_on_couchdb_by_collection(
        add_rights_and_type_to_collection,
        collection_key=collection_key,
    )
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_by_collection


# pass in a Couchdb doc, get back one with de-duplicated sourceResource values
def dedupe_sourceresource(doc):
    '''Remove duplicate entries from every list in ``doc['sourceResource']``.

    Only values that are lists are touched.  Within each list the first
    occurrence of an item is kept and order is preserved; items must compare
    *exactly* equal to count as duplicates.

    The doc is modified in place and also returned.  Raises ``KeyError`` if
    the doc has no ``sourceResource``.
    '''
    source_resource = doc['sourceResource']
    for key, value in source_resource.items():
        if not isinstance(value, list):
            continue
        # can't use set() because of dict values (non-hashable)
        unique_items = []
        for item in value:
            if item not in unique_items:
                unique_items.append(item)
        # Rebinding an existing key does not resize the dict, so this is
        # safe while iterating over its items.
        source_resource[key] = unique_items
    return doc


if __name__ == '__main__':
    # The original passed the undefined name ``dedup_sourceresource`` here,
    # which raised NameError as soon as the script was run.
    doc_ids = run_on_couchdb_by_collection(dedupe_sourceresource)
    print("NUMBER OF DOCS DEDUPED:{}".format(len(doc_ids)))
###"10664",
###"10671",
###"10710",
###"10722",
###"10732",
###"108",
###"10885",
###"10935",
###"11",
###"11010",
###"11068",
###"11073",
###"11075",
###"11076",
###"11084",
###"11134",
###"11167",
###"11439",
###"11575",
###"11582",
###"1161",
###"11705",
###]
# NOTE(review): the ### entries above look like a disabled list of
# collection ids -- confirm before re-enabling any of them.

# Run fix_rights_status over each id in collection_ids.
for collection_key in collection_ids:
    print("RUN FOR {}".format(collection_key))
    # Flush both streams before handing off to the per-collection run.
    sys.stderr.flush()
    sys.stdout.flush()
    run_on_couchdb_by_collection(fix_rights_status,
                                 collection_key=collection_key)
def fill_object_values_from_solr(doc):
    '''Fill ``object`` and ``object_dimensions`` from solr when missing.

    A doc that already has an ``object`` key is skipped.  Otherwise solr is
    searched for ``harvest_id_s`` equal to the doc's ``_id``.  If the first
    result has ``reference_image_md5``, that value becomes ``doc['object']``
    and ``reference_image_dimensions`` split on ``:`` becomes
    ``doc['object_dimensions']``.

    Returns the updated ``doc`` when values were filled in, else ``None``.
    '''
    if 'object' in doc:
        return None
    doc_id = doc['_id']
    solr_fields = 'harvest_id_s, reference_image_md5, id, collection_url, reference_image_dimensions'
    resp = SOLR(q='harvest_id_s:"{}"'.format(doc_id), fields=solr_fields)
    hits = resp.results
    # NOTE(review): indentation was lost in this chunk; the "NOT IN SOLR"
    # branch is taken to pair with the empty-results check -- confirm.
    if not hits:
        print('NOT IN SOLR -- {}'.format("NO OBJECT FOR {}".format(doc_id)))
        return None
    first_hit = hits[0]
    if 'reference_image_md5' not in first_hit:
        return None
    doc['object'] = first_hit['reference_image_md5']
    doc['object_dimensions'] = first_hit['reference_image_dimensions'].split(':')
    print("OBJ DIM:{}".format(doc['object_dimensions']))
    print('UPDATING OBJECT -- {}'.format(doc_id))
    return doc


run_on_couchdb_by_collection(
    fill_object_values_from_solr,
    # collection_key="23066"  (commented-out alternative kept from the original)
    collection_key="26094",
)
###"10632",
###"10664",
###"10671",
###"10710",
###"10722",
###"10732",
###"108",
###"10885",
###"10935",
###"11",
###"11010",
###"11068",
###"11073",
###"11075",
###"11076",
###"11084",
###"11134",
###"11167",
###"11439",
###"11575",
###"11582",
###"1161",
###"11705",
###]
# NOTE(review): the ### entries above look like a disabled list of
# collection ids -- confirm before re-enabling any of them.

# Run fix_rights_status over each id in collection_ids.
for collection_key in collection_ids:
    print("RUN FOR {}".format(collection_key))
    # Flush both streams before handing off to the per-collection run.
    sys.stderr.flush()
    sys.stdout.flush()
    run_on_couchdb_by_collection(fix_rights_status,
                                 collection_key=collection_key)