def clear_missing(self, verbose=False): conn = Solr(settings.SOLR_URL) start = 0 to_delete = [] pb = None if verbose: print "Checking for indexed records no longer in database" while True: if verbose and pb: pb.update(start) result = conn.search('*:*', sort='id asc', start=start, rows=500, fields=['id']) if not result: break if verbose and not pb: pb = ProgressBar(result.hits) ids = [int(r['id']) for r in result] records = Record.objects.filter(id__in=ids).values_list('id', flat=True) for r in records: ids.remove(r) to_delete.extend(ids) start += 500 if verbose and pb: pb.done() pb = None if verbose and to_delete: print "Removing unneeded records from index" pb = ProgressBar(len(to_delete)) while to_delete: if verbose and pb: pb.update(pb.total - len(to_delete)) conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete[:500]))) to_delete = to_delete[500:] if verbose and pb: pb.done()
def handle(self, *args, **kwargs): coll = kwargs.get('collection') if not coll: print "--collection is a required parameter" return if coll.isdigit(): collection = Collection.objects.get(id=coll) else: collection = Collection.objects.get(name=coll) admins = User.objects.filter(is_superuser=True) if admins: admin = admins[0] else: admin = None pb = ProgressBar(collection.records.count()) for count, record in enumerate(collection.records.all()): get_thumbnail_for_record(record, admin) get_thumbnail_for_record(record, admin, crop_to_square=True) pb.update(count) pb.done()
def handle(self, *args, **kwargs):
    """Export the field values of all records in the given collections
    to a CSV file (UTF-8 encoded values).

    Each record produces one row; repeated occurrences of the same field
    either get joined with --separator or spill over into additional
    rows.  Each media file occupies the __file__/__path__ columns of one
    row.
    """
    data_file = kwargs.get('data_file')
    collections = map(int, kwargs.get('collections') or list())
    separator = kwargs.get('separator')
    fields = list(
        Field.objects
        .filter(fieldvalue__record__collection__in=collections)
        .distinct()
    )
    with open(data_file, 'w') as csvfile:
        columns = [field.full_name for field in fields]
        columns.extend(['__file__', '__path__'])
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        records = Record.objects.filter(collection__in=collections)
        progress = ProgressBar(records.count())
        for position, record in enumerate(records):
            pending = record.get_fieldvalues()
            media_files = list(
                record.media_set.select_related('storage').all())
            # Keep emitting rows until all values and media are written.
            while pending or media_files:
                row = {}
                carried = []
                for fieldvalue in pending:
                    column = fieldvalue.field.full_name
                    text = fieldvalue.value.encode('utf8')
                    if column not in row:
                        row[column] = text
                    elif separator:
                        row[column] += separator + text
                    else:
                        # No separator: defer duplicate to the next row.
                        carried.append(fieldvalue)
                if media_files:
                    medium = media_files.pop()
                    row['__file__'] = medium.url
                    row['__path__'] = medium.storage.base
                writer.writerow(row)
                pending = carried
            progress.update(position)
        progress.done()
def refresh(self):
    """Re-run identify() on every Media object.

    Processes the table in slices of 1000 and calls reset_queries()
    after each slice so Django's cached query log does not grow without
    bound during the run.
    """
    processed = 0
    total = Media.objects.count()
    progress = ProgressBar(total)
    batch_size = 1000
    for offset in range(0, total, batch_size):
        for media in Media.objects.all()[offset:offset + batch_size]:
            media.identify()
            processed += 1
            progress.update(processed)
        # Drop Django's per-connection query log between batches.
        reset_queries()
    progress.done()
def handle(self, *args, **kwargs):
    """Export the field values of all records in the given collections
    to a CSV file.

    One row per record; repeated occurrences of a field are joined with
    --separator when given, otherwise they overflow into extra rows.
    Media files fill the __file__/__path__ columns, one per row.
    """
    data_file = kwargs.get('data_file')
    collections = map(int, kwargs.get('collections') or list())
    separator = kwargs.get('separator')
    fields = list(
        Field.objects
        .filter(fieldvalue__record__collection__in=collections)
        .distinct()
    )
    with open(data_file, 'w') as csvfile:
        columns = [field.full_name for field in fields]
        columns.extend(['__file__', '__path__'])
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        records = Record.objects.filter(collection__in=collections)
        progress = ProgressBar(records.count())
        for position, record in enumerate(records):
            pending = record.get_fieldvalues()
            media_files = list(
                record.media_set.select_related('storage').all())
            # Emit rows until every value and media file is written out.
            while pending or media_files:
                row = {}
                carried = []
                for fieldvalue in pending:
                    column = fieldvalue.field.full_name
                    text = fieldvalue.value
                    if column not in row:
                        row[column] = text
                    elif separator:
                        row[column] += separator + text
                    else:
                        # No separator: defer duplicate to the next row.
                        carried.append(fieldvalue)
                if media_files:
                    medium = media_files.pop()
                    row['__file__'] = medium.url
                    row['__path__'] = medium.storage.base
                writer.writerow(row)
                pending = carried
            progress.update(position)
        progress.done()
def handle(self, *args, **kwargs): updated = 0 pb = ProgressBar(Media.objects.count()) for count, media in enumerate(Media.objects.all()): name = slugify(os.path.splitext(os.path.basename(media.url))[0]) if name != media.name: media.name = name media.save(force_update_name=True) updated += 1 pb.update(count) pb.done() print "Updated %d media objects" % updated
def remove(self): common = self.check() print "Removing unneeded media objects" pb = ProgressBar(len(common)) count = 0 for id in common: m = Media.objects.filter(record__id=id) m.filter(url__startswith='medium\\').delete() m.filter(url__startswith='thumb\\').delete() count += 1 pb.update(count) pb.done()
def handle(self, *prefix, **options):
    """Strip a leading prefix from every Media URL that carries it.

    Each positional argument is a candidate prefix; the first one that
    matches a media URL is removed from it.  Prints how many of the
    scanned media objects were updated.
    """
    if not prefix:
        print(self.help)
    else:
        count = updated = 0
        total = Media.objects.count()
        pb = ProgressBar(total)
        for i in range(0, total, 100):
            for media in Media.objects.all()[i:i + 100]:
                # Bug fix: the original sliced with len(prefix), the
                # length of the *argument tuple*, so a matching URL lost
                # only one character per vararg instead of the actual
                # matched prefix.  Strip the length of the prefix string
                # that matched.
                for candidate in prefix:
                    if media.url.startswith(candidate):
                        media.url = media.url[len(candidate):]
                        media.save()
                        updated += 1
                        break
                count += 1
                pb.update(count)
            # Clear Django's cached query log between batches.
            reset_queries()
        pb.done()
        print("Updated %d/%d media objects" % (updated, count))
def handle(self, *prefix, **options): if not prefix: print self.help else: count = updated = 0 total = Media.objects.count() pb = ProgressBar(total) for i in range(0, total, 100): for media in Media.objects.all()[i:i+100]: if media.url.startswith(prefix): media.url = media.url[len(prefix):] media.save() updated += 1 count += 1 pb.update(count) reset_queries() pb.done() print "Updated %d/%d media objects" % (updated, count)
def handle(self, *args, **kwargs): updated = 0 id_fields = standardfield('identifier', equiv=True) titles = FieldValue.objects.select_related('record').filter(field__in=id_fields) pb = ProgressBar(titles.count()) for count, title in enumerate(titles): name = slugify(title.value) if name != title.record.name: title.record.name = name title.record.save(force_update_name=True) updated += 1 pb.update(count) pb.done() print "Updated %d record objects" % updated
def handle(self, *args, **kwargs):
    """Rename records to a slug of their identifier field value and
    report how many were updated."""
    changed = 0
    identifier_fields = standardfield('identifier', equiv=True)
    fieldvalues = FieldValue.objects.select_related('record').filter(
        field__in=identifier_fields)
    progress = ProgressBar(fieldvalues.count())
    for index, fieldvalue in enumerate(fieldvalues):
        slug = slugify(fieldvalue.value)
        if slug != fieldvalue.record.name:
            fieldvalue.record.name = slug
            fieldvalue.record.save(force_update_name=True)
            changed += 1
        progress.update(index)
    progress.done()
    print("Updated %d record objects" % changed)
def handle(self, from_collection, to_collections, commit, *args, **options):
    """Re-point presentation items from one collection's records to the
    matching records (by shared identifier value) in other collections.

    Matching is only performed when exactly one target record carries
    the identifier; ambiguous or missing matches are reported as errors.
    Changes are only saved when commit is true.
    """
    if not from_collection or not to_collections:
        print("Error: Must specify --from and --to arguments")
        return
    print("Mapping presentation items from collection %s to " \
        "collection(s) %s" % (from_collection, to_collections))
    idfields = standardfield_ids('identifier', equiv=True)
    print("Fetching identifiers")
    # Only unowned, context-free, visible identifier values are used.
    query = FieldValue.objects.filter(
        field__in=idfields,
        record__collectionitem__collection=from_collection,
        owner=None,
        context_type=None,
        hidden=False).values_list('value', 'record')
    # record id -> list of identifier values in the source collection
    record_to_id = dict()
    for identifier, record in query:
        record_to_id.setdefault(record, []).append(identifier)
    print("Fetching target records")
    query = FieldValue.objects.filter(
        field__in=idfields,
        record__collectionitem__collection__in=to_collections,
        owner=None,
        context_type=None,
        hidden=False).values_list('value', 'record')
    # identifier value -> list of record ids in the target collections
    id_to_record = dict()
    for identifier, record in query:
        id_to_record.setdefault(identifier, []).append(record)
    print("Mapping presentation items")
    remapped = 0
    errors = []
    items = PresentationItem.objects.filter(
        record__collectionitem__collection=from_collection)
    pb = ProgressBar(len(items))
    for count, item in enumerate(items):
        identifiers = record_to_id.get(item.record_id)
        if identifiers:
            for identifier in identifiers:
                new_records = id_to_record.get(identifier)
                if new_records:
                    if len(new_records) == 1:
                        # Unambiguous match: remap (first match wins).
                        remapped += 1
                        if commit:
                            item.record_id = new_records[0]
                            item.save()
                        break
                    else:
                        errors.append(
                            "Multiple matching records with identifier "
                            "'%s' found in collection %s: %s" %
                            (identifier, to_collections,
                             sorted(new_records)))
                else:
                    errors.append(
                        "No record with identifier '%s' found in "
                        "collection %s" % (identifier, to_collections))
        else:
            errors.append("No identifier found for record %s" %
                          item.record_id)
        pb.update(count)
    pb.done()
    # De-duplicate and order the collected error messages for output.
    errors = sorted(set(errors))
    if commit:
        print("Remapped %s items" % remapped)
    else:
        print("Would have remapped %s items - rerun with --commit" %
              remapped)
    if errors:
        print("%s unique errors occurred:" % len(errors))
        print('\n'.join(errors))
def index(self, verbose=False, all=False):
    """Push records into the Solr index.

    With all=True every Record is (re)indexed; otherwise one batch of
    pending SolrIndexUpdates is processed (deletions first, then
    updates).  Documents are built and submitted on a background thread
    that overlaps with preloading the next batch; only one worker thread
    runs at a time.
    """
    from models import SolrIndexUpdates
    self._build_group_tree()
    conn = Solr(settings.SOLR_URL)
    # Map each Dublin Core field to its equivalent fields for doc building.
    core_fields = dict((f, f.get_equivalent_fields())
                       for f in Field.objects.filter(standard__prefix='dc'))
    count = 0
    batch_size = 500
    process_thread = None
    if all:
        total_count = Record.objects.count()
    else:
        processed_updates = []
        to_update = []
        to_delete = []
        # Consume one batch of queued index updates, splitting them into
        # records to delete from and records to (re)add to the index.
        for id, record, delete in SolrIndexUpdates.objects.all()[
                :batch_size].values_list('id', 'record', 'delete'):
            processed_updates.append(id)
            if delete:
                to_delete.append(record)
            else:
                to_update.append(record)
        if to_delete:
            conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))
        total_count = len(to_update)
    if verbose:
        pb = ProgressBar(total_count)
    while True:
        if verbose:
            pb.update(count)
        if all:
            record_ids = Record.objects.all()[
                count:count + batch_size].values_list('id', flat=True)
        else:
            record_ids = Record.objects.filter(id__in=to_update)[
                count:count + batch_size].values_list('id', flat=True)
        if not record_ids:
            break
        # convert to plain list, because Django's value lists will add a
        # LIMIT clause when used in an __in query, which causes MySQL to
        # break
        record_ids = list(record_ids)
        media_dict = self._preload_related(Media, record_ids)
        fieldvalue_dict = self._preload_related(FieldValue, record_ids,
                                                related=2)
        groups_dict = self._preload_related(CollectionItem, record_ids)
        count += len(record_ids)

        def process_data(groups, fieldvalues, media):
            # Returns the worker callable; preloaded dicts are passed in
            # as arguments so the thread does not race with the next
            # iteration's rebinding of the outer names.
            # NOTE(review): the inner closure still reads record_ids,
            # core_fields and conn from the enclosing scope; this appears
            # safe only because the previous thread is joined before
            # record_ids is rebound — confirm before reordering.
            def process():
                docs = []
                for record in Record.objects.filter(id__in=record_ids):
                    docs += [self._record_to_solr(
                        record, core_fields,
                        groups.get(record.id, []),
                        fieldvalues.get(record.id, []),
                        media.get(record.id, []))]
                conn.add(docs)
            return process

        if process_thread:
            process_thread.join()
        process_thread = Thread(target=process_data(
            groups_dict, fieldvalue_dict, media_dict))
        process_thread.start()
        reset_queries()
    if process_thread:
        process_thread.join()
    if verbose:
        pb.done()
    if all:
        SolrIndexUpdates.objects.filter(delete=False).delete()
    else:
        SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()
def index(self, verbose=False, all=False, collections=None):
    """Push records into the Solr index.

    With all=True every Record (optionally restricted to the given
    collections) is (re)indexed; otherwise one batch of pending
    SolrIndexUpdates is processed (deletions first, then updates).
    Custom per-document processors can be plugged in via the
    SOLR_RECORD_INDEXER / SOLR_RECORD_PRE_INDEXER settings.  Document
    building and submission runs on a background thread that overlaps
    with preloading the next batch.  Returns the number of records
    processed.
    """
    from .models import SolrIndexUpdates
    self._build_group_tree()
    # Map each Dublin Core field to its equivalent fields for doc building.
    core_fields = dict(
        (f, f.get_equivalent_fields())
        for f in Field.objects.filter(standard__prefix='dc')
    )
    # add VRA Title to support work titles
    try:
        vra_title = Field.objects.get(name='title', standard__prefix='vra')
        core_fields[vra_title] = vra_title.get_equivalent_fields()
    except Field.DoesNotExist:
        pass
    count = 0
    batch_size = 100
    process_thread = None
    if all:
        query = Record.objects.all()
        if collections:
            query = query.filter(collection__in=collections)
        total_count = query.count()
        to_update = None
        to_delete = None
    else:
        processed_updates = []
        to_update = []
        to_delete = []
        # Consume one batch of queued index updates, splitting them into
        # records to delete from and records to (re)add to the index.
        updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
            'id', 'record', 'delete')
        for id, record, delete in updates:
            processed_updates.append(id)
            if delete:
                to_delete.append(record)
            else:
                to_update.append(record)
        total_count = len(to_update)
    if not all and not to_update and not to_delete:
        logger.info("Nothing to update in index, returning early")
        return 0
    conn = Solr(settings.SOLR_URL)
    if to_delete:
        conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))
    primary_work_record_manager = PrimaryWorkRecordManager()
    if verbose:
        pb = ProgressBar(total_count)

    def get_method(method):
        # Resolve a dotted path ("pkg.module.func") to a callable;
        # returns None (and logs) when it cannot be imported.
        module, _, function = method.rpartition('.')
        try:
            __import__(module)
            mod = sys.modules[module]
            return getattr(mod, function)
        except Exception as ex:
            logging.debug(
                "Could not import custom Solr record indexer %s: %s",
                method, ex)

    def get_custom_doc_processor():
        # Post-processor applied to each built doc; identity by default.
        method = getattr(settings, 'SOLR_RECORD_INDEXER', None)
        if method:
            method = get_method(method)
        return method or (lambda doc, **kwargs: doc)

    def get_custom_doc_pre_processor():
        # Pre-processor invoked before each doc is built; no-op default.
        method = getattr(settings, 'SOLR_RECORD_PRE_INDEXER', None)
        if method:
            method = get_method(method)
        return method or (lambda **kwargs: None)

    custom_doc_processor = get_custom_doc_processor()
    custom_doc_pre_processor = get_custom_doc_pre_processor()

    while True:
        if verbose:
            pb.update(count)
        if all:
            records = Record.objects.all()
            if collections:
                records = records.filter(collection__in=collections)
        else:
            records = Record.objects.filter(id__in=to_update)
        records = records[count:count + batch_size]
        record_ids = records.values_list('id', flat=True)
        if not record_ids:
            break
        # convert to plain list, because Django's value lists will add a
        # LIMIT clause when used in an __in query, which causes MySQL to
        # break. (ph): also, made an explicit separate value for this
        record_id_list = list(record_ids)
        media_dict = self._preload_related(Media, record_id_list)
        fieldvalue_dict = self._preload_related(FieldValue, record_id_list,
                                                fields=('field',))
        groups_dict = self._preload_related(CollectionItem, record_id_list)
        image_to_works = self._preload_image_to_works(record_id_list)
        work_to_images = self._preload_work_to_images(record_id_list)
        implicit_primary_work_records = primary_work_record_manager \
            .get_implicit_primary_work_records(record_id_list)
        count += len(record_id_list)

        # VERY IMPORTANT: SINCE process_data RUNS IN ANOTHER THREAD, IT
        # CANNOT DIRECTLY ACCESS ANY VARIABLES FROM THE OUTER SCOPE
        # ALWAYS PASS IN ANY NEEDED VARIABLES
        def process_data(groups, fieldvalues, media, record_id_list,
                         image_to_works, work_to_images,
                         implicit_primary_work_records):
            def process():
                docs = []
                for record in Record.objects.filter(id__in=record_id_list):
                    g = groups.get(record.id, [])
                    fv = fieldvalues.get(record.id, [])
                    m = media.get(record.id, [])
                    custom_doc_pre_processor(
                        record=record,
                        core_fields=core_fields,
                        groups=g,
                        fieldvalues=fv,
                        media=m,
                    )
                    doc = self._record_to_solr(
                        record, core_fields, g, fv, m,
                        image_to_works, work_to_images,
                        implicit_primary_work_records
                    )
                    doc = custom_doc_processor(
                        doc,
                        record=record,
                        core_fields=core_fields,
                        groups=g,
                        fieldvalues=fv,
                        media=m,
                    )
                    docs.append(doc)
                conn.add(docs)
            return process

        if process_thread:
            # Only one worker at a time: wait for the previous batch.
            process_thread.join()
        process_thread = Thread(
            target=process_data(groups_dict, fieldvalue_dict, media_dict,
                                record_id_list, image_to_works,
                                work_to_images,
                                implicit_primary_work_records))
        process_thread.start()
        reset_queries()
    if process_thread:
        process_thread.join()
    if verbose:
        pb.done()
    if all:
        # TODO: this will remove objects that have been added
        # in the meantime
        SolrIndexUpdates.objects.filter(delete=False).delete()
    else:
        SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()
    return count
if self.preserve_memory: o = ObjectHistory.objects.get(content_type=self.content_type, m2m_content_type=self.m2m_content_type, type=self.type, original_id=oid) # these objects have been deleted since the last migration if not self.m2m_model: self.model.objects.filter(id=o.object_id).delete() else: self.m2m_delete(object_id=o.object_id, m2m_object_id=o.m2m_object_id) logging.debug('%s %s not in source, deleting' % (self.model_name, o.original_id)) self.deleted += 1 o.delete() count += 1 pb.update(count) pb.done() reset_queries() if self.need_instance_map and not self.m2m_model: print "Retrieving instances" ids = dict(ObjectHistory.objects.filter(content_type=self.content_type, m2m_content_type=None, type=self.type).values_list('object_id', 'original_id')) self.instance_map.update((ids.get(o.id, None), o) for o in self.model.objects.all()) self.instance_map.update(merged_ids) print " Added\tReadded\tDeleted\tUpdated\t Unch.\t Merged\t Errors\tNo hist" print "%7d\t%7d\t%7d\t%7d\t%7d\t%7d\t%7d\t%7d" % ( self.added, self.recreated, self.deleted, self.updated, self.unchanged, len(merged_ids), self.errors, self.nohistory )
def index(self, verbose=False, all=False):
    """Push records into the Solr index.

    With all=True every Record is (re)indexed; otherwise one batch of
    pending SolrIndexUpdates is processed (deletions first, then
    updates).  Document building and submission runs on a background
    thread that overlaps with preloading the next batch; only one worker
    thread runs at a time.  Returns the number of records processed.
    """
    from models import SolrIndexUpdates
    self._build_group_tree()
    # Map each Dublin Core field to its equivalent fields for doc building.
    core_fields = dict(
        (f, f.get_equivalent_fields())
        for f in Field.objects.filter(standard__prefix='dc'))
    count = 0
    batch_size = 500
    process_thread = None
    if all:
        total_count = Record.objects.count()
        to_update = None
        to_delete = None
    else:
        processed_updates = []
        to_update = []
        to_delete = []
        # Consume one batch of queued index updates, splitting them into
        # records to delete from and records to (re)add to the index.
        updates = SolrIndexUpdates.objects.all()[:batch_size].values_list(
            'id', 'record', 'delete')
        for id, record, delete in updates:
            processed_updates.append(id)
            if delete:
                to_delete.append(record)
            else:
                to_update.append(record)
        total_count = len(to_update)
    if not all and not to_update and not to_delete:
        logger.info("Nothing to update in index, returning early")
        return 0
    conn = Solr(settings.SOLR_URL)
    if to_delete:
        conn.delete(q='id:(%s)' % ' '.join(map(str, to_delete)))
    if verbose:
        pb = ProgressBar(total_count)
    while True:
        if verbose:
            pb.update(count)
        if all:
            records = Record.objects.all()
        else:
            records = Record.objects.filter(id__in=to_update)
        records = records[count:count + batch_size]
        record_ids = records.values_list('id', flat=True)
        if not record_ids:
            break
        # convert to plain list, because Django's value lists will add a
        # LIMIT clause when used in an __in query, which causes MySQL to
        # break. (ph): also, made an explicit separate value for this
        record_id_list = list(record_ids)
        media_dict = self._preload_related(Media, record_id_list)
        fieldvalue_dict = self._preload_related(FieldValue, record_id_list,
                                                related=2)
        groups_dict = self._preload_related(CollectionItem, record_id_list)
        count += len(record_id_list)

        def process_data(groups, fieldvalues, media, record_id_list):
            # Returns the worker callable; all per-batch data is passed
            # in as arguments so the thread does not race with the next
            # iteration's rebinding of the outer names.
            def process():
                docs = []
                for record in Record.objects.filter(id__in=record_id_list):
                    docs.append(
                        self._record_to_solr(
                            record, core_fields,
                            groups.get(record.id, []),
                            fieldvalues.get(record.id, []),
                            media.get(record.id, [])))
                conn.add(docs)
            return process

        if process_thread:
            # Only one worker at a time: wait for the previous batch.
            process_thread.join()
        process_thread = Thread(target=process_data(
            groups_dict, fieldvalue_dict, media_dict, record_id_list))
        process_thread.start()
        reset_queries()
    if process_thread:
        process_thread.join()
    if verbose:
        pb.done()
    if all:
        SolrIndexUpdates.objects.filter(delete=False).delete()
    else:
        SolrIndexUpdates.objects.filter(id__in=processed_updates).delete()
    return count
def handle(self, *args, **kwargs):
    """Rebuild work relations for records from a CSV mapping file.

    The CSV must have Identifier, Work and Primary columns.  All
    existing dc.relation/IsPartOf values in the given collections are
    deleted, then recreated from the mapping; the 'primary-work-record'
    system field value is created or removed according to the Primary
    column.
    """
    system_field = get_system_field()
    collections = map(int, kwargs.get('collections') or list())
    mapping_file = kwargs.get('mapping_file')
    if not collections:
        print "--collection is a required parameter"
        return
    if not mapping_file:
        print "--mapping is a required parameter"
        return
    # identifier value -> (work id, primary flag as string)
    mappings = dict()
    with open(mapping_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            mappings[row['Identifier']] = (row['Work'], row['Primary'])
    related_field = Field.objects.get(
        standard__prefix='dc',
        name='relation',
    )
    existing_works = FieldValue.objects.filter(
        record__collection__in=collections,
        field=related_field,
        refinement='IsPartOf',
    )
    # Clean out old relations
    print "Deleting old works info"
    existing_works.delete()
    id_fields = standardfield_ids('identifier', equiv=True)
    print "Fetching records"
    identifiers = FieldValue.objects.select_related('record').filter(
        record__collection__in=collections,
        field__in=id_fields,
    )
    pb = ProgressBar(identifiers.count())
    # Insert new relations
    for count, identifier in enumerate(identifiers):
        work, isprimary = mappings.get(identifier.value, (None, False))
        # Primary column holds the string 'True'/'False'.
        isprimary = isprimary == 'True'
        if not work:
            print "Warning: no entry found for identifier '%s'" % \
                identifier.value
            continue
        FieldValue.objects.create(record=identifier.record,
                                  field=related_field,
                                  refinement='IsPartOf',
                                  value=work,
                                  hidden=True)
        fv = list(
            FieldValue.objects.filter(record=identifier.record,
                                      field=system_field,
                                      label='primary-work-record'))
        if len(fv) > 0:
            # Marker exists: drop it if this record is no longer primary.
            if not isprimary:
                for f in fv:
                    f.delete()
        elif isprimary:
            # No marker yet but the record is primary: create one.
            FieldValue.objects.create(
                record=identifier.record,
                field=system_field,
                label='primary-work-record',
                value=work,
                hidden=True,
            )
        pb.update(count)
    pb.done()
def handle(self, *args, **kwargs):
    """Rebuild work relations from a CSV mapping of image file names to
    work ids.

    The first image listed for each fk_WorkID is treated as the primary
    item; every other image in the work gets a dc.relation/IsPartOf
    value pointing at the primary's identifier.  Existing IsPartOf
    relations in the given collections are deleted first.
    """
    mapping_file = kwargs.get('mapping_file')
    collections = map(int, kwargs.get('collections') or list())
    if not mapping_file or not collections:
        print "--collection and --mapping are required parameters"
        return
    # work id -> list of image identifiers, in file order
    works = dict()
    with open(mapping_file, 'rU') as mappings:
        reader = csv.DictReader(mappings)
        for row in reader:
            identifier = row['ImageFileName']
            work = row['fk_WorkID']
            works.setdefault(work, []).append(identifier)
    # Clean out old relations
    FieldValue.objects.filter(
        record__collection__in=collections,
        field__standard__prefix='dc',
        field__name='relation',
        refinement='IsPartOf',
    ).delete()
    related_field = Field.objects.get(
        standard__prefix='dc',
        name='relation',
    )
    id_fields = standardfield_ids('identifier', equiv=True)
    print "Caching record identifiers"
    # identifier value -> record id, for fast lookup below
    identifiers = dict()
    values = FieldValue.objects.select_related('record').filter(
        record__collection__in=collections, field__in=id_fields)
    for fv in values:
        identifiers[fv.value] = fv.record.id
    pb = ProgressBar(len(works))
    # Insert new relations
    for count, work in enumerate(works.itervalues()):
        # First image in the work is the primary; the rest point to it.
        primary = work[0]
        items = work[1:]
        for item in items:
            # Try the raw name and, for .jpg files, the extension-less name.
            options = [item]
            if item.lower().endswith('.jpg'):
                options.append(item[:-4])
            record = None
            for option in options:
                record = identifiers.get(option)
                if record:
                    break
            else:
                # for/else: no option matched a record — skip this item.
                continue
            FieldValue.objects.create(record=Record.objects.get(id=record),
                                      field=related_field,
                                      refinement='IsPartOf',
                                      value=primary)
        pb.update(count)
    pb.done()
to_delete = [] else: self.m2m_delete(object_id=o.object_id, m2m_object_id=o.m2m_object_id) o.delete() self.deleted += 1 count += 1 if count % 1000 == 0: reset_queries() pb.update(count) if to_delete: self.model.objects.filter( id__in=[d.object_id for d in to_delete]).delete() ObjectHistory.objects.filter( id__in=[d.id for d in to_delete]).delete() pb.done() reset_queries() if self.need_instance_map and not self.m2m_model: print "Retrieving instances" ids = dict( ObjectHistory.objects.filter(content_type=self.content_type, m2m_content_type=None, type=self.type).values_list( 'object_id', 'original_id')) self.instance_map.update( (ids.get(o.id, None), o) for o in self.model.objects.all()) self.instance_map.update(merged_ids) print " Added\tReadded\tDeleted\tUpdated\t Unch.\t Merged\t Errors\tNo hist" print "%7d\t%7d\t%7d\t%7d\t%7d\t%7d\t%7d\t%7d" % ( self.added, self.recreated, self.deleted, self.updated,
def handle(self, *args, **kwargs):
    """Rebuild work relations for records from a CSV mapping file.

    The CSV must have Identifier, Work and Primary columns.  All
    existing dc.relation/IsPartOf values in the given collections are
    deleted, then recreated from the mapping; the 'primary-work-record'
    system field value is created or removed according to the Primary
    column.
    """
    system_field = get_system_field()
    collections = map(int, kwargs.get('collections') or list())
    mapping_file = kwargs.get('mapping_file')
    if not collections:
        print "--collection is a required parameter"
        return
    if not mapping_file:
        print "--mapping is a required parameter"
        return
    # identifier value -> (work id, primary flag as string)
    mappings = dict()
    with open(mapping_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            mappings[row['Identifier']] = (row['Work'], row['Primary'])
    related_field = Field.objects.get(
        standard__prefix='dc',
        name='relation',
    )
    existing_works = FieldValue.objects.filter(
        record__collection__in=collections,
        field=related_field,
        refinement='IsPartOf',
    )
    # Clean out old relations
    print "Deleting old works info"
    existing_works.delete()
    id_fields = standardfield_ids('identifier', equiv=True)
    print "Fetching records"
    identifiers = FieldValue.objects.select_related('record').filter(
        record__collection__in=collections,
        field__in=id_fields,
    )
    pb = ProgressBar(identifiers.count())
    # Insert new relations
    for count, identifier in enumerate(identifiers):
        work, isprimary = mappings.get(identifier.value, (None, False))
        # Primary column holds the string 'True'/'False'.
        isprimary = isprimary == 'True'
        if not work:
            print "Warning: no entry found for identifier '%s'" % \
                identifier.value
            continue
        FieldValue.objects.create(
            record=identifier.record,
            field=related_field,
            refinement='IsPartOf',
            value=work,
            hidden=True
        )
        fv = list(FieldValue.objects.filter(
            record=identifier.record,
            field=system_field,
            label='primary-work-record'
        ))
        if len(fv) > 0:
            # Marker exists: drop it if this record is no longer primary.
            if not isprimary:
                for f in fv:
                    f.delete()
        elif isprimary:
            # No marker yet but the record is primary: create one.
            FieldValue.objects.create(
                record=identifier.record,
                field=system_field,
                label='primary-work-record',
                value=work,
                hidden=True,
            )
        pb.update(count)
    pb.done()