def run(self):
    """
    Migrate segmented railroad rows to their target subject sets.

    Resolves a target subject set for every segmented-row subject, then
    adds the subjects to their target sets in batches (one API call per
    target set).
    """
    subjects_and_their_target_sets = self._calculate_target_subject_sets_by_subject()
    segmented_rows_and_their_target_sets = \
        self._segmented_row_target_sets(subjects_and_their_target_sets)

    # Group subjects by destination so each set is updated with one call
    additions_by_target_set = defaultdict(list)
    for subject_id, target_subject_set_id in segmented_rows_and_their_target_sets.items():
        self._logger.debug('Saving segmented row %d to set: %s',
                           subject_id, target_subject_set_id)
        subject = Subject.find(subject_id)
        additions_by_target_set[target_subject_set_id].append(subject)
        # NOTE(review): the original also built a removals_by_target_set dict
        # (each subject's current set memberships) but never used it —
        # removing subjects from their previous sets was evidently intended
        # but never implemented. The dead accumulation has been dropped.
        # TODO: implement removal from the previous subject sets if required.

    # Add the batched subjects to their appropriate target sets
    for target_subject_set_id, new_subjects in additions_by_target_set.items():
        target_subject_set = self._get_subject_set(target_subject_set_id)
        target_subject_set.add(new_subjects)
def _hydrate_book_and_page(cls, row):
    """
    Copy the book and page values onto a subject's metadata and persist it.

    :param row: mapping with a 'subject_id' key identifying the subject.
    :raises ValueError: if any required book/page metadata field is None.
    """
    subject = Subject.find(row['subject_id'])
    subject_model = SubjectModel(subject)
    subject.metadata['book'] = subject_model['book']
    subject.metadata['page'] = subject_model['page']
    for field in cls.BOOK_AND_PAGE_FIELDS:
        if subject.metadata[field] is None:
            # Bug fix: the original applied % only to `field` (operator
            # precedence), which raised TypeError at runtime instead of the
            # intended ValueError message; the format args must be a tuple.
            raise ValueError(
                "WARN: None '%s' for subject %d and filepath %s"
                % (field, subject.id, subject.metadata['filepath'])
            )
    subject.save()
def link_new_set(self, subject_set_id):
    """
    Link an existing subject set to this project's workflow.

    :param subject_set_id: ID of the subject set to link.
    """
    # Removed leftover debug print(1)/print(2) statements; renamed
    # workflowSet -> workflow per PEP 8.
    workflow = Workflow()
    subject = Subject()
    subject.links.project = self.project
    # NOTE(review): looking up a subject *set* via Subject.find looks
    # suspect — confirm this shouldn't be SubjectSet.find(subject_set_id).
    sset = subject.find(subject_set_id)
    workflow.links.project = self.project
    # NOTE(review): `links.sub` looks like a typo/abbreviation — verify
    # against the panoptes-client API (possibly subject_sets.add).
    workflow.links.sub(sset)
def queue_new_subject_creation(cls, subject_id, vertex_centroids, target_subject_set_id):
    """
    Fetch a subject's image and segment it into row subjects.

    Downloads the subject image, splits it into columns at the vertex
    centroids, upscales any undersized column images, segments each column
    into rows, and pushes the resulting row images as new subjects.
    Static-w/-instance-of-self pattern to support enqueuing in RQ.
    """
    ops = QueueOperations(setup_logger(cls.LOGGER_NAME, 'log/queue_operations.log'))
    subject = Subject.find(subject_id)

    image_path = ops.fetch_subject_image_to_tmp(subject)
    column_paths = ops.perform_column_segmentation(
        subject_id,
        image_path,
        vertex_centroids
    )
    for column_path in column_paths:
        ops.upscale_small_images(column_path)

    rows_by_column = ops.perform_row_segmentation(column_paths)
    ops.push_new_row_subjects(subject, target_subject_set_id, rows_by_column)
def ls(subject_set_id, quiet, subject_ids):
    """
    Lists subject IDs and their media URLs.
    """
    # Explicit IDs take precedence over a subject-set listing
    if subject_ids:
        for sid in subject_ids:
            found = Subject.find(sid)
            if quiet:
                click.echo(found.id)
            else:
                echo_subject(found)
        return

    results = Subject.where(subject_set_id=subject_set_id)
    if quiet:
        click.echo(" ".join(s.id for s in results))
        return
    for subject in results:
        echo_subject(subject)
with open(SUBJECT_ID_FILE) as subject_id_f: subject_ids = [ s.strip() for s in subject_id_f.readlines() ] Panoptes.connect(**config) with ChargingBar( 'Updating', max=len(subject_ids), suffix='%(percent).1f%% %(eta_td)s' ) as bar: with Subject.async_saves(): for subject_id in subject_ids: bar.next() subject = Subject.find(subject_id) if '!CERiT' in subject.metadata: continue superwasp_id = subject.metadata.get('Filename', subject.metadata.get('filename')).split('_')[0] coords = superwasp_id.replace('1SWASP', '') coords_quoted = urllib.parse.quote(coords) ra = urllib.parse.quote('{}:{}:{}'.format( coords[1:3], coords[3:5], coords[5:10] )) dec = urllib.parse.quote('{}:{}:{}'.format( coords[10:13], coords[13:15],
def run():
    """
    Query for completed subjects, calculate kmeans vertex centroids, fetch
    subject images, split columns by centroids, row segmentation with Ocropy.
    """
    logger = setup_logger(settings.APP_NAME,
                          'log/kmeans_and_enqueue_completed_subjects.log',
                          logging.DEBUG)
    subject_set_csv = SubjectSetCSV()
    workflow_router = SubjectSetWorkflowRouter(subject_set_csv, settings, logger)
    pages_raw_subject_ids = subject_set_csv.raw_pages_subject_ids()
    logger.debug("Running Wires and Rails Workflow Processor")
    Panoptes.connect(username=settings.PANOPTES_USERNAME,
                     password=settings.PANOPTES_PASSWORD)

    retired_subject_ids = []
    # Each entry: [subject_id, centroids, target_subject_set_id]
    vertices_and_target_subject_sets = []

    for _subject_set_id, metadata in settings.COLUMNS_WORKFLOW_METADATA.items():
        # NOTE(review): the format string below is missing its closing ')'
        # after %(task_id)s — harmless (log text only), but worth fixing.
        logger.debug("Loading vertices / subject retirement info for %(debug_name)s subject set " \
            "(subject set id: %(subject_set_id)d; workflow id: %(workflow_id)d; task id: " \
            " %(task_id)s", metadata)

        classification_kwargs = {
            'scope': 'project',
            'project_id': settings.PROJECT_ID,
            'workflow_id': metadata['workflow_id']
        }
        logger.debug("Loading classifications by params %s", str(classification_kwargs))
        classifications_records = [c for c in Classification.where(**classification_kwargs)]

        classifications = VertexClassifications(classifications_records, pages_raw_subject_ids)

        # Aggregate vertex centroids
        centroids_by_subject = classifications.vertex_centroids(metadata['task_id'])
        for subject_id, centroids in centroids_by_subject.items():
            # Find target subject set ID, or log and skip the subject
            try:
                target_subject_set_id = workflow_router \
                    .target_subject_set_id(subject_id, classifications_records)
            except UnidentifiedRawSubjectSetException as ex:
                logger.error(ex.args[0])
                continue
            except SharedMajorityException as ex:
                # TODO need add'l monitoring for this, e.g. manual report exception
                logger.error(ex.args[0])
                continue
            vertices_and_target_subject_sets.append(
                [subject_id, centroids, target_subject_set_id])

        # Aggregate retired subjects for this workflow
        workflow = Workflow.find(metadata['workflow_id'])
        retirement_count = workflow.retirement['options']['count']
        retired_subject_ids += classifications.retired_subject_ids(
            metadata['task_id'], retirement_count)

    logger.debug(
        'Retrieved the following subject centroids for image segmentation: %s',
        str(vertices_and_target_subject_sets))
    logger.debug('For the following retired subject IDs: %s',
                 str(retired_subject_ids))

    # Enqueue segmentation work for retired, not-yet-processed subjects
    queue = Queue(connection=Redis(host=settings.REDIS_HOST))
    for subject_id, centroids, target_subject_set_id in vertices_and_target_subject_sets:
        # Only process subjects that have been retired by the workflow
        if subject_id not in retired_subject_ids:
            continue
        subject = Subject.find(subject_id)
        # Skip subjects already flagged as processed in their metadata
        if settings.METADATA_KEY_ALREADY_PROCESSED in subject.metadata and \
           subject.metadata[settings.METADATA_KEY_ALREADY_PROCESSED]:
            logger.debug('Skipping subject id %d; already processed.', subject_id)
            continue
        logger.debug('Enqueuing subjects id: %d', subject_id)
        queue.enqueue(
            QueueOperations.queue_new_subject_creation,
            subject_id,
            centroids,
            target_subject_set_id,
            timeout=2 * 60 * 60
        )
        # Flag immediately so a later run doesn't enqueue the same subject twice
        QueueOperations.flag_subject_as_queued(subject)
def delete(force, subject_ids):
    """Delete the given subjects, confirming each one unless --force is set."""
    for sid in subject_ids:
        if not force:
            click.confirm('Delete subject {}?'.format(sid), abort=True)
        doomed = Subject.find(sid)
        doomed.delete()
def info(subject_id):
    """Print the subject's raw record as YAML."""
    click.echo(yaml.dump(Subject.find(subject_id).raw))
def get_subject(subject_id):
    """Fetch a single subject by its ID."""
    found = Subject.find(subject_id)
    return found
def ls(subject_id):
    """Display a single subject's ID and media URLs."""
    subject = Subject.find(subject_id)
    echo_subject(subject)