def pushSubject(subjectSet, project, imageLocations, metadata, livePost):
    """Create a subject, save it and attach it to ``subjectSet``.

    When ``livePost`` is falsy nothing is created and ``None`` is returned.
    Otherwise a ``Subject`` is linked to ``project``, given every entry of
    ``imageLocations`` as a location and ``metadata`` as its metadata, saved,
    added to ``subjectSet`` and returned.

    The save is retried for as long as it raises ``ConnectionError``; there
    is no upper bound on the number of attempts.
    """
    # Dry run: skip every Panoptes call.
    if not livePost:
        return None

    new_subject = Subject()
    new_subject.links.project = project
    for location in imageLocations:
        new_subject.add_location(location)
    new_subject.metadata.update(metadata)

    # Loop until one save attempt completes without a ConnectionError.
    while True:
        try:
            new_subject.save()
        except ConnectionError as err:
            print('{} , TRYING AGAIN'.format(err))
        else:
            break

    subjectSet.add(new_subject)
    return new_subject
def make_tutorial_images(imagePaths, ellipseData, projectData):
    """Upload one subject per image path and add them all to a subject set.

    Args:
        imagePaths: iterable of image locations; the position of each path
            is used as the group key passed to ``ellipseData.get_group``.
        ellipseData: grouped data exposing ``get_group(imageId)``; the
            group (with its index reset) is handed to ``make_metadata``.
        projectData: mapping providing ``"user_name"``, ``"password"`` and
            ``"subject_set"`` (the ID of the destination subject set).

    Returns:
        None. If the subject set cannot be found the API error is printed
        and nothing is uploaded.
    """
    # Connect to Panoptes
    Panoptes.connect(
        username=projectData["user_name"], password=projectData["password"]
    )

    # The destination set is the same for every image, so resolve it once
    # up front instead of on every loop iteration. This also guarantees
    # ``subjectSet`` is bound when ``imagePaths`` is empty.
    try:
        subjectSet = SubjectSet.find(projectData["subject_set"])
    except PanoptesAPIException as e:
        print(e)
        return

    newSubjects = []
    for imageId, imagePath in enumerate(imagePaths):
        print(f"Adding {imagePath}...")
        newSubject = Subject()
        newSubject.add_location(imagePath)
        newSubject.links.project = subjectSet.links.project
        newSubject.metadata.update(
            make_metadata(
                ellipseData.get_group(imageId).reset_index(drop=True),
                imagePath
            )
        )
        newSubject.save()
        newSubjects.append(newSubject)

    # Nothing to link when no image was supplied.
    if newSubjects:
        subjectSet.add(newSubjects)
def uploadSubjectToSet(project, subjectSet, locationsList, metadataList):
    """Create one subject per (locations, metadata) pair and add them to a set.

    Each entry of ``locationsList`` is indexed as ``[0]`` (the galaxy image,
    added with ``Subject.add_location``), ``[1]`` (model/image comparison)
    and ``[2]`` (model only); the latter two are attached through
    ``addLocation`` with an explicit ``application/json`` MIME type.

    Args:
        project: project the new subjects are linked to.
        subjectSet: subject set that receives the subjects.
        locationsList: sequence of 3-item location entries (see above).
        metadataList: sequence of metadata dicts, same length as
            ``locationsList``.

    Returns:
        ``subjectSet`` on completion, or ``None`` when the two lists differ
        in length (an error message is printed and nothing is uploaded).
    """
    print('Uploading {} subjects to {}'.format(len(locationsList), subjectSet))
    # imagePath can be string or list, metadata must be same dimension
    if len(locationsList) != len(metadataList):
        print(
            '\t\033[31mInvalid arguments, locationsList and metadataList',
            'must have same length\033[0m'
        )
        return
    subjects = []
    # zip() has no length, so tell tqdm the total to get a real progress bar
    pairs = zip(locationsList, metadataList)
    for locations, meta in tqdm(pairs, total=len(locationsList)):
        # the json subjects need to be added in a more manual way so we can
        # specify a MIME type
        subject = Subject()
        subject.links.project = project
        # comparison between model and image
        addLocation(subject, {'application/json': locations[1]})
        # actual galaxy image
        subject.add_location(locations[0])
        # and now just the model
        addLocation(subject, {'application/json': locations[2]})
        for k, v in meta.items():
            subject.metadata[k] = v
        try:
            subject.save()
        except RuntimeError as err:
            # Still best-effort (the subject stays in the batch exactly as
            # before), but the failure is now reported instead of hidden.
            print('\t\033[31mSaving subject failed: {}\033[0m'.format(err))
        subjects.append(subject)
    subjectSet.add(subjects)
    return subjectSet
def push_new_row_subjects(self, source_subject, target_subject_set_id, row_paths_by_column):
    """
    Create one unclassified row subject per image path and add them all to
    the subject set identified by target_subject_set_id.

    row_paths_by_column maps a column index to the image paths of the rows
    cut from that column. Every new subject copies the 'book' and 'page'
    metadata of source_subject and records the source subject id and the
    column index it came from.
    """
    inherited_fields = ('book', 'page')

    project = Project.find(settings.PROJECT_ID)
    target_set = SubjectSet.find(target_subject_set_id)

    created = []
    for column_index, row_paths in row_paths_by_column.items():
        self._logger.info('Creating %d new row subjects for column index %d for subject %s',
                          len(row_paths), column_index, source_subject.id)
        for row_path in row_paths:
            row_subject = Subject()
            row_subject.links.project = project

            # Carry over the provenance of the source document.
            for field in inherited_fields:
                row_subject.metadata[field] = source_subject.metadata[field]
            row_subject.metadata['source_document_subject_id'] = source_subject.id
            row_subject.metadata['source_document_column_index'] = column_index

            row_subject.add_location(row_path)
            row_subject.save()
            created.append(row_subject)

    # Link everything in a single call once all subjects exist.
    target_set.add(created)
def save_subject(manifest_item, project, pbar=None):
    """
    Add manifest item to project.
    Note: follow with subject_set.add(subject) to associate with subject set.

    Args:
        manifest_item (dict): of form {png_loc: img.png, key_data: some_data_dict}
        project (str): project to upload subject to e.g. '5773' for Galaxy Zoo
        pbar (tqdm.tqdm): progress bar to update. If None, no bar will display.

    Returns:
        Subject: the saved subject

    Raises:
        AssertionError: if the file at manifest_item['png_loc'] does not exist
    """
    png_loc = manifest_item['png_loc']
    # Check the file before building anything so a bad path fails fast.
    assert os.path.exists(png_loc), 'Missing subject image: {}'.format(png_loc)

    subject = Subject()
    subject.links.project = project
    subject.add_location(png_loc)
    subject.metadata.update(manifest_item['key_data'])
    subject.save()

    # Compare against None explicitly: the truth value of a tqdm bar depends
    # on its total/iterable, so `if pbar:` can skip or fail for a valid bar.
    if pbar is not None:
        pbar.update()

    return subject
def add_new_subject(self, image_list, metadata_list, subject_set_name):
    """
    Create a new subject set and fill it with one subject per image.

    image_list and metadata_list must be of equal length; the i-th metadata
    entry is attached to the i-th image.

    :param image_list: list of images to be added
    :param metadata_list: list of metadata to be added
    :param subject_set_name: display name of the subject set that is created
    :return: None
    :raises ValueError: if the two lists differ in length (raised before
        anything is created on the server)
    """
    # Start by making sure we have two equal length lists. Fail before
    # touching the API so a mismatch cannot leave a half-filled subject set.
    if len(image_list) != len(metadata_list):
        raise ValueError("Image list and metadata list do not match")

    # Create the subject set that will hold the new subjects
    subject_set = SubjectSet()
    subject_set.links.project = self.project
    subject_set.display_name = subject_set_name
    subject_set.save()

    # Go through the image and metadata list and add the items
    new_subjects = []
    for image, metadata in zip(image_list, metadata_list):
        subject = Subject()
        subject.links.project = self.project
        subject.add_location(image)
        subject.metadata.update(metadata)
        subject.save()
        new_subjects.append(subject)

    subject_set.add(new_subjects)
def create_subject(project, metadata, media_files):
    """Build, save and return a subject linked to ``project``.

    Every entry of ``media_files`` is attached as a location (in order) and
    ``metadata`` is merged into the subject's metadata before saving.
    """
    new_subject = Subject()
    new_subject.links.project = project

    for location in media_files:
        new_subject.add_location(location)

    new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
def _create_subject(self, project_id, filename, metadata=None):
    """Create and save a single-location subject for the given project.

    The project is looked up with ``Project.find(project_id)``. ``metadata``
    is merged into the subject only when it is truthy (so ``None`` and an
    empty mapping are both skipped). Returns the saved subject.
    """
    new_subject = Subject()
    new_subject.links.project = Project.find(project_id)
    new_subject.add_location(filename)

    if metadata:
        new_subject.metadata.update(metadata)

    new_subject.save()
    return new_subject
def create_subject(project, media_files, metadata):
    """ Create, save and return a subject

    Args:
        - project: a Project() object defining the Zooniverse project
        - media_files: a list of media files to link to the subject
        - metadata: a dictionary with metadata to attach

    Returns:
        the saved Subject
    """
    new_subject = Subject()
    new_subject.links.project = project

    # Locations are attached in the order given.
    for media_file in media_files:
        new_subject.add_location(media_file)

    new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
def link_new_set(self, subject_set_id):
    """
    Look up an object by ``subject_set_id`` and pass it to
    ``workflowSet.links.sub`` on a freshly constructed ``Workflow``.

    NOTE(review): judging by the name this is meant to link a subject set to
    a workflow, but the code below does not obviously do that -- confirm the
    intent before relying on it.

    :param subject_set_id: identifier passed to ``subject.find``
    :return: None
    """
    # A new, unsaved Workflow; nothing in this method saves it.
    workflowSet = Workflow()
    subject = Subject()
    subject.links.project = self.project
    # NOTE(review): the lookup goes through a Subject instance, not
    # SubjectSet -- presumably SubjectSet.find was intended; verify.
    sset = subject.find(subject_set_id)
    print(1)  # progress marker (debug output)
    workflowSet.links.project = self.project
    print(2)  # progress marker (debug output)
    # NOTE(review): ``links.sub`` is not used anywhere else in this file --
    # confirm it exists on the links object.
    workflowSet.links.sub(sset)
def upload_subject(locations: List, project: Project, subject_set_name: str, metadata: Dict):
    """Create a subject from local files, save it and add it to a named set.

    Every path in ``locations`` must be an existing file; the first one that
    is not raises ``FileNotFoundError`` before anything is saved. The subject
    set is obtained from ``get_or_create_subject_set(project.id,
    subject_set_name)``.

    Returns the id of the saved subject.
    """
    new_subject = Subject()

    # add files
    new_subject.links.project = project
    for path in locations:
        if os.path.isfile(path):
            new_subject.add_location(path)
        else:
            raise FileNotFoundError(
                'Missing subject location: {}'.format(path))

    new_subject.metadata.update(metadata)

    target_set = get_or_create_subject_set(project.id, subject_set_name)

    new_subject.save()
    target_set.add(new_subject)
    return new_subject.id
def upload_images(id, use_database=True):
    """Create a subject set named ``id`` and upload its images to Zooniverse.

    The function changes into ``target + id``, reads the first line of every
    ``*-manifest.txt`` file there (comma-separated: subject id, three image
    locations, source name, ra, dec, size), creates one subject per
    manifest, adds them to a new subject set and links that set to workflow
    11973. The original working directory is restored afterwards, even if
    the upload fails.

    Args:
        id: field identifier; used as directory name and subject set name.
        use_database: when true, ``update_status`` is called before
            ('Uploading') and after ('In progress') the upload.
    """
    print('Create subject set and upload images for', id)
    if use_database:
        update_status(id, gz_status='Uploading')
    wd = os.getcwd()
    Panoptes.connect(username='******', password=os.environ['PANOPTES_PASSWORD'])
    os.chdir(target + id)
    try:
        project = Project.find(slug='chrismrp/radio-galaxy-zoo-lofar')

        subject_set = SubjectSet()
        subject_set.display_name = id
        subject_set.links.project = project
        subject_set.save()
        print('Made subject set')

        new_subjects = []
        g = glob.glob('*-manifest.txt')
        for i, f in enumerate(g):
            # Only the first line of a manifest is used; close the file
            # promptly instead of leaking the handle.
            with open(f) as manifest:
                bits = manifest.readlines()[0].split(',')
            metadata = {
                'subject_id': int(bits[0]),
                'ra': float(bits[5]),
                'dec': float(bits[6]),
                '#size': float(bits[7]),
                'source_name': bits[4]
            }
            print('Upload doing', bits[4], '%i/%i' % (i, len(g)))
            subject = Subject()
            subject.links.project = project
            subject.metadata.update(metadata)
            for location in bits[1:4]:
                subject.add_location(location)
            subject.save()
            new_subjects.append(subject)

        subject_set.add(new_subjects)

        workflow = Workflow(11973)
        workflow.links.subject_sets.add(subject_set)
    finally:
        # Do not leave the process in the field directory.
        os.chdir(wd)
    if use_database:
        update_status(id, gz_status='In progress')
    print('Done!')
def _create_subjects_from_epicollect5(self, project, subjects_metadata):
    """Create and save one subject per metadata record; return them as a list.

    For each record the fields 'id', 'project', 'obs_type', 'source', 'url',
    'created_at', 'observer' and 'comment' are copied verbatim, 'longitude'
    and 'latitude' are read from the nested 'location' mapping, and
    'spectrum_type' falls back to "?" when absent. The record's 'url' is
    attached as an 'image/jpg' location.
    """
    # Keys copied one-to-one, in the order they are written.
    direct_keys = ('id', 'project', 'obs_type', 'source', 'url',
                   'created_at', 'observer')

    created = []
    for record in subjects_metadata:
        new_subject = Subject()
        target = new_subject.metadata

        for key in direct_keys:
            target[key] = record[key]
        target['longitude'] = record['location']['longitude']
        target['latitude'] = record['location']['latitude']
        target['comment'] = record['comment']
        target['spectrum_type'] = record.get('spectrum_type', "?")

        new_subject.add_location({'image/jpg': record['url']})
        new_subject.links.project = project
        new_subject.save()
        created.append(new_subject)

    return created
def create_subjects_and_link_to_project(self, proto_subjects, project_id,
                                        workflow_id, subject_set_id):
    """Upload subjects into a subject set and link the set to a workflow.

    Connects to Panoptes with the ``PANOPTES_USERNAME`` / ``PANOPTES_PASSWORD``
    environment variables and ``self.ENDPOINT``. When ``subject_set_id`` is
    None a new subject set named after the current UTC time is created,
    otherwise the existing set is looked up.

    Args:
        proto_subjects: iterable of dicts with the keys ``'location_lc'``,
            ``'location_ps'`` (both attached as locations) and ``'metadata'``.
        project_id: ID of the project the subjects belong to.
        workflow_id: ID of the workflow that receives the subject set.
        subject_set_id: ID of an existing subject set, or None to create one.

    Returns:
        None. Any exception is logged through ``self.log.exception`` and
        not re-raised.
    """
    try:
        USERNAME = os.getenv('PANOPTES_USERNAME')
        PASSWORD = os.getenv('PANOPTES_PASSWORD')
        Panoptes.connect(username=USERNAME,
                         password=PASSWORD,
                         endpoint=self.ENDPOINT)

        project = Project.find(project_id)
        # find() is called on the class; no throwaway instance is needed.
        workflow = Workflow.find(workflow_id)

        if subject_set_id is None:
            subject_set = SubjectSet()
            ts = time.gmtime()
            subject_set.display_name = time.strftime(
                "%m-%d-%Y %H:%M:%S", ts)
            subject_set.links.project = project
            subject_set.save()
        else:
            subject_set = SubjectSet.find(subject_set_id)

        subjects = []
        for proto_subject in proto_subjects:
            subject = Subject()
            subject.links.project = project
            subject.add_location(proto_subject['location_lc'])
            subject.add_location(proto_subject['location_ps'])
            subject.metadata.update(proto_subject['metadata'])
            subject.save()
            subjects.append(subject)

        subject_set.add(subjects)
        workflow.add_subject_sets(subject_set)
    except Exception:
        # Top-level boundary: record the traceback and carry on.
        self.log.exception("Error in create_subjects_and_link_to_project ")
def upload_chunks(self, destination, project_slug, set_prefix,
                  zooniverse_login, zooniverse_pwd, batches=0, **kwargs):
    """Upload not-yet-uploaded chunk batches to a Zooniverse project.

    Reads ``<destination>/chunks.csv`` (indexed by its ``index`` column),
    and for every batch that is not fully uploaded creates a subject set
    named ``"<set_prefix>_batch_<batch>"`` holding one subject per chunk.
    After each batch the CSV is rewritten with the upload bookkeeping
    (``zooniverse_id``, ``project_slug``, ``subject_set``, ``uploaded``).

    :param destination: directory containing ``chunks.csv`` and ``chunks/``
    :param project_slug: slug of the Zooniverse project
    :param set_prefix: prefix for the names of the created subject sets
    :param zooniverse_login: Zooniverse user name
    :param zooniverse_pwd: Zooniverse password
    :param batches: maximum number of batches to upload; 0 means no limit
    :raises Exception: if the chunk metadata cannot be read
    """
    self.destination = destination

    metadata_location = os.path.join(self.destination, 'chunks.csv')
    try:
        self.chunks = pd.read_csv(metadata_location, index_col='index')
    except Exception as e:
        # Keep the underlying error attached so the real cause is visible.
        raise Exception(
            "cannot read chunk metadata in {}. Check the --destination parameter, and make sure you have extracted chunks before."
            .format(metadata_location)) from e

    Panoptes.connect(username=zooniverse_login, password=zooniverse_pwd)
    zooniverse_project = Project.find(slug=project_slug)

    uploaded = 0
    for batch, chunks in self.chunks.groupby('batch'):
        # Skip batches that a previous run already finished.
        if chunks['uploaded'].all():
            continue

        subjects_metadata = []
        subject_set = SubjectSet()
        subject_set.links.project = zooniverse_project
        subject_set.display_name = "{}_batch_{}".format(set_prefix, batch)
        subject_set.save()
        subjects = []

        _chunks = chunks.to_dict(orient='index')
        for chunk_index, chunk in _chunks.items():
            print("uploading chunk {} ({},{}) in batch {}".format(
                chunk['recording'], chunk['onset'], chunk['offset'], batch))

            subject = Subject()
            subject.links.project = zooniverse_project
            subject.add_location(
                os.path.join(self.destination, 'chunks', chunk['mp3']))
            subject.metadata['date_extracted'] = chunk['date_extracted']
            subject.save()
            subjects.append(subject)

            chunk['index'] = chunk_index
            chunk['zooniverse_id'] = subject.id
            chunk['project_slug'] = project_slug
            chunk['subject_set'] = str(subject_set.display_name)
            chunk['uploaded'] = True
            subjects_metadata.append(chunk)

        subject_set.add(subjects)

        # Persist progress after every batch so an interrupted run can resume.
        self.chunks.update(
            pd.DataFrame(subjects_metadata).set_index('index'))
        self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
        uploaded += 1

        if batches > 0 and uploaded >= batches:
            return
} segments.append(segment) print('Item segments transformation complete.') return segments segments = transform_item_segments('https://www.loc.gov/item/' + LIBRARY_OF_CONGRESS_ITEM_ID) Panoptes.connect(username=USERNAME, password=PASSWORD, endpoint=ENDPOINT) project = Project.find(PROJECT) subject_set = SubjectSet() subject_set.links.project = project subject_set.display_name = segments[0]['metadata']['Title'] # uses item Title as default subject set name, or feel free to hardcode subject_set.save() print('Begin Zooniverse subject upload...') for segment in segments: subject = Subject() subject.links.project = project subject.add_location(segment['location']) subject.metadata.update(segment['metadata']) subject.save() subject_set.add(subject) print("Zooniverse subject upload complete.")
def upload_chunks(self,
                  chunks: str,
                  project_id: int,
                  set_name: str,
                  zooniverse_login="",
                  zooniverse_pwd="",
                  amount: int = 1000,
                  ignore_errors: bool = False,
                  **kwargs):
    """Uploads ``amount`` audio chunks from the CSV dataframe `chunks` to a zooniverse project.

    :param chunks: path to the chunk CSV dataframe
    :type chunks: str
    :param project_id: zooniverse project id
    :type project_id: int
    :param set_name: name of the subject set
    :type set_name: str
    :param zooniverse_login: zooniverse login. If not specified, the program attempts to get it from the environment variable ``ZOONIVERSE_LOGIN`` instead, defaults to ''
    :type zooniverse_login: str, optional
    :param zooniverse_pwd: zooniverse password. If not specified, the program attempts to get it from the environment variable ``ZOONIVERSE_PWD`` instead, defaults to ''
    :type zooniverse_pwd: str, optional
    :param amount: amount of chunks to upload, defaults to 1000
    :type amount: int, optional
    :param ignore_errors: when True, a chunk that fails to save is skipped and the upload continues; when False the upload stops at the first failure (chunks saved so far are still linked and recorded), defaults to False
    :type ignore_errors: bool, optional
    :raises Exception: if the chunk metadata cannot be read
    """
    self.chunks_file = chunks
    self.get_credentials(zooniverse_login, zooniverse_pwd)

    metadata_location = self.chunks_file
    try:
        self.chunks = pd.read_csv(metadata_location, index_col="index")
    except Exception as e:
        # Keep the underlying error attached so the real cause is visible.
        raise Exception("cannot read chunk metadata from {}.".format(
            metadata_location)) from e

    assert_dataframe("chunks", self.chunks)
    assert_columns_presence(
        "chunks",
        self.chunks,
        {"recording_filename", "onset", "offset", "uploaded", "mp3"},
    )

    from panoptes_client import Panoptes, Project, Subject, SubjectSet

    Panoptes.connect(username=self.zooniverse_login,
                     password=self.zooniverse_pwd)
    zooniverse_project = Project(project_id)

    subjects_metadata = []

    # Reuse an existing subject set with this name (the last match wins),
    # otherwise create it.
    subject_set = None
    for ss in zooniverse_project.links.subject_sets:
        if ss.display_name == set_name:
            subject_set = ss

    if subject_set is None:
        subject_set = SubjectSet()
        subject_set.links.project = zooniverse_project
        subject_set.display_name = set_name
        subject_set.save()

    subjects = []

    chunks_to_upload = self.chunks[self.chunks["uploaded"] == False].head(
        amount)
    chunks_to_upload = chunks_to_upload.to_dict(orient="index")

    if len(chunks_to_upload) == 0:
        print("nothing left to upload.")
        return

    for chunk_index, chunk in chunks_to_upload.items():
        print("uploading chunk {} ({},{})".format(
            chunk["recording_filename"], chunk["onset"], chunk["offset"]))

        subject = Subject()
        subject.links.project = zooniverse_project
        subject.add_location(
            os.path.join(os.path.dirname(self.chunks_file), "chunks",
                         chunk["mp3"]))
        subject.metadata["date_extracted"] = chunk["date_extracted"]

        try:
            subject.save()
        except Exception as e:
            print("failed to save chunk {}. an exception has occured:\n{}".
                  format(chunk_index, str(e)))
            print(traceback.format_exc())

            # Use the function's own parameter; there is no ``args`` object
            # in this scope.
            if ignore_errors:
                continue
            else:
                print("subject upload halting here.")
                break

        subjects.append(subject)

        chunk["index"] = chunk_index
        chunk["zooniverse_id"] = str(subject.id)
        chunk["project_id"] = str(project_id)
        chunk["subject_set"] = str(subject_set.display_name)
        chunk["uploaded"] = True
        subjects_metadata.append(chunk)

    if len(subjects) == 0:
        return

    subject_set.add(subjects)

    # Record what was uploaded back into the CSV.
    self.chunks.update(pd.DataFrame(subjects_metadata).set_index("index"))
    self.chunks.to_csv(self.chunks_file)
# delete the tmp file after the images have been resized try: Panoptes.connect(username=zcfg.login['user'], password=zcfg.login['pass']) project = Project.find("6307") except Exception as e: f = open(logfile, "a") t = time.localtime() f.write('Unable to connect to Zooniverse: '+time.strftime("%D:%H:%M:%S",t)+'\n') f.close() subject_set = SubjectSet() s = Subject() subject_set.links.project = project subject_set.display_name = 'Tutorial subject set 2' images = glob.glob(path) new_subjects = [] for img in images: try: s = Subject() s.links.project = project # manifest file if os.path.splitext(img)[1] == ".csv": # upload manifest info.... not sure how this will be set up after second step # move csv to complete images folder shutil.copy(f, completed_images)
def create_subjects_and_link_to_project(proto_subjects,
                                        project_id,
                                        subject_set_id,
                                        subject_set_name=None):
    ''' find the project and relevant subject set. Get the existing subject data and compare to the new proto_subjects.
    Upload any instances of new subjects to the project

    Keyword Arguments:
    proto_subjects -- dictionary structure containing subject filepath+filename, and associated metadata
    project_id -- identifier to find and link with the project
    subject_set_id -- identifier for the subject set of interest; None creates a new subject set
    subject_set_name -- name for a new subject set (a dated, randomised name is generated when omitted);
                        for an existing set it must match that set's name if given

    Returns -1 when subject_set_name does not match the existing set's name, None otherwise.
    '''

    # get the project object
    project = Project.find(project_id)

    # set up subject_set
    if subject_set_id is None:
        subject_set = SubjectSet()  # create empty subject_set
        subject_set.links.project = project

        if subject_set_name is None:  # if not defined generate a random subject set name to avoid error when a set already exists
            subject_set_name = 'subject_set_{:02d}_{:02d}_{:04d}_{}'.format(
                date.day, date.month, date.year,
                ''.join(generate_random_str()))
        print("will create a subject set called: {}".format(subject_set_name))
        subject_set.display_name = subject_set_name  # set the name of the subject set
        subject_set.save()
        project.reload()
    else:
        subject_set = SubjectSet.find(
            subject_set_id)  # find the existing subject_set
        existing_subject_set_name = subject_set.display_name  # get its name

        # if you have tried to set the subject set name, check that it matches the name for the chosen subject set id
        if (subject_set_name is not None) and (existing_subject_set_name !=
                                               subject_set_name):
            print(
                "your chosen subject set name does not match the existing name: {}, {}"
                .format(subject_set_name, existing_subject_set_name))
            return -1
        else:
            subject_set_name = existing_subject_set_name

        print("add to existing subject set: {}".format(subject_set_name))

    # Create a list of the existing subject metadata
    meta_list = []
    print("existing subjects:")
    for subject in subject_set.subjects:
        print(subject.id, subject.metadata)
        meta_list.append(subject.metadata)

    # When making list of subjects to add, check to see if the metadata of the subject you want to add is already in the set
    print("new subjects:")
    new_subjects = []
    for filename, metadata in proto_subjects.items():

        # check if this subject is already in the subject set
        if np.isin(metadata, meta_list):
            print("{}, subject already in set".format(metadata))
            # In this case we skip over the subject that already exists.
            # N.B. you may want to remove an existing subject and update it with the new one
            continue

        # Otherwise we can add the subject to the new subject list
        subject = Subject()

        subject.links.project = project
        subject.add_location(filename)

        subject.metadata.update(metadata)

        subject.save()
        new_subjects.append(subject)
        print("{}, new subject add to list".format(metadata))

    print("new subjects to add: {}".format(new_subjects))

    # add the new subject list (data and metadata) to the already defined project subject set
    subject_set.add(new_subjects)

    return
def upload_subjects(
    subject_set_id,
    manifest_files,
    allow_missing,
    remote_location,
    mime_type,
    file_column,
):
    """
    Uploads subjects from each of the given MANIFEST_FILES.

    Example with only local files:

    $ panoptes subject-set upload-subjects 4667 manifest.csv

    Local filenames will be automatically detected in the manifest and
    uploaded, or filename columns can be specified with --file-column.

    If you are hosting your media yourself, you can put the URLs in the
    manifest and specify the column number(s):

    $ panoptes subject-set upload-subjects -r 1 4667 manifest.csv

    $ panoptes subject-set upload-subjects -r 1 -r 2 4667 manifest.csv

    Any local files will still be detected and uploaded.
    """
    # Maintainer notes are kept in comments so the docstring text itself
    # is left untouched.
    #
    # Overview:
    #   1. Build `upload_state`, either fresh from the arguments or loaded
    #      from a single YAML state file written by an earlier failed run.
    #   2. For a fresh upload, read every CSV manifest into
    #      `subject_rows` = [(index, (files, metadata)), ...].
    #   3. Save subjects asynchronously, moving each row from
    #      'waiting_to_upload' to 'waiting_to_link' once its save result is
    #      available, and linking them to the subject set in batches.
    #   4. If anything is still pending when the upload ends, offer to dump
    #      `upload_state` to a YAML file so the upload can be resumed.
    #
    # Returns -1 on validation errors; otherwise falls off the end (None).

    if (
        len(manifest_files) > 1
        and any(map(lambda m: m.endswith('.yaml'), manifest_files))
    ):
        click.echo(
            'Error: YAML manifests must be processed one at a time.',
            err=True,
        )
        return -1
    elif manifest_files[0].endswith('.yaml'):
        with open(manifest_files[0], 'r') as yaml_manifest:
            # NOTE(review): FullLoader is used on a user-supplied file. That
            # is fine for state files written by this command; do not point
            # it at untrusted YAML.
            upload_state = yaml.load(yaml_manifest, Loader=yaml.FullLoader)
        if upload_state['state_version'] > CURRENT_STATE_VERSION:
            click.echo(
                'Error: {} was generated by a newer version of the Panoptes '
                'CLI and is not compatible with this version.'.format(
                    manifest_files[0],
                ),
                err=True,
            )
            return -1
        if upload_state['subject_set_id'] != subject_set_id:
            click.echo(
                'Warning: You specified subject set {} but this YAML '
                'manifest is for subject set {}.'.format(
                    subject_set_id,
                    upload_state['subject_set_id'],
                ),
                err=True,
            )
            click.confirm(
                'Upload {} to subject set {} ({})?'.format(
                    manifest_files[0],
                    subject_set_id,
                    SubjectSet.find(subject_set_id).display_name,
                ),
                abort=True
            )
            upload_state['subject_set_id'] = subject_set_id
        resumed_upload = True
    else:
        upload_state = {
            'state_version': CURRENT_STATE_VERSION,
            'subject_set_id': subject_set_id,
            'manifest_files': manifest_files,
            'allow_missing': allow_missing,
            'remote_location': remote_location,
            'mime_type': mime_type,
            'file_column': file_column,
            'waiting_to_upload': [],
            'waiting_to_link': {},
        }
        resumed_upload = False

    # A single MIME type is applied to every remote location; otherwise the
    # counts must match one-to-one.
    remote_location_count = len(upload_state['remote_location'])
    mime_type_count = len(upload_state['mime_type'])
    if remote_location_count > 1 and mime_type_count == 1:
        upload_state['mime_type'] = (
            upload_state['mime_type'] * remote_location_count
        )
    elif remote_location_count > 0 and mime_type_count != remote_location_count:
        click.echo(
            'Error: The number of MIME types given must be either 1 or equal '
            'to the number of remote locations.',
            err=True,
        )
        return -1

    def validate_file(file_path):
        # True when the file exists, is non-empty and within the size limit;
        # otherwise report the problem on stderr and return False.
        if not os.path.isfile(file_path):
            click.echo(
                'Error: File "{}" could not be found.'.format(
                    file_path,
                ),
                err=True,
            )
            return False

        file_size = os.path.getsize(file_path)
        if file_size == 0:
            click.echo(
                'Error: File "{}" is empty.'.format(
                    file_path,
                ),
                err=True,
            )
            return False
        elif file_size > MAX_UPLOAD_FILE_SIZE:
            click.echo(
                'Error: File "{}" is {}, larger than the maximum {}.'.format(
                    file_path,
                    humanize.naturalsize(file_size),
                    humanize.naturalsize(MAX_UPLOAD_FILE_SIZE),
                ),
                err=True,
            )
            return False
        return True

    subject_set = SubjectSet.find(upload_state['subject_set_id'])
    if not resumed_upload:
        subject_rows = []
        for manifest_file in upload_state['manifest_files']:
            # Plain 'r' already gives universal newlines; the old 'U' mode
            # flag is rejected by Python 3.11 and later.
            with open(manifest_file, 'r') as manifest_f:
                file_root = os.path.dirname(manifest_file)
                r = csv.reader(manifest_f, skipinitialspace=True)
                headers = next(r)
                for row in r:
                    metadata = dict(zip(headers, row))
                    files = []
                    if not upload_state['file_column']:
                        # No explicit file columns yet: detect them from the
                        # first row by checking which cells name a local file.
                        upload_state['file_column'] = []
                        for field_number, col in enumerate(row, start=1):
                            file_path = os.path.join(file_root, col)
                            if os.path.exists(file_path):
                                upload_state['file_column'].append(
                                    field_number,
                                )
                                if not validate_file(file_path):
                                    return -1
                                files.append(file_path)
                    else:
                        for field_number in upload_state['file_column']:
                            file_path = os.path.join(
                                file_root,
                                row[field_number - 1]
                            )
                            if not validate_file(file_path):
                                return -1
                            files.append(file_path)

                    # Remote locations are passed as {mime_type: url} dicts.
                    for field_number, _mime_type in zip(
                        upload_state['remote_location'],
                        upload_state['mime_type'],
                    ):
                        files.append({_mime_type: row[field_number - 1]})

                    if len(files) == 0:
                        click.echo(
                            'Could not find any files in row:',
                            err=True,
                        )
                        click.echo(','.join(row), err=True)
                        if not upload_state['allow_missing']:
                            return -1
                        else:
                            continue
                    subject_rows.append((files, metadata))

                if not subject_rows:
                    click.echo(
                        'File {} did not contain any rows.'.format(
                            manifest_file,
                        ),
                        err=True,
                    )
                    return -1

        subject_rows = list(enumerate(subject_rows))
        upload_state['waiting_to_upload'] = copy.deepcopy(subject_rows)
    else:
        # Subjects recorded as created-but-unlinked that no longer exist on
        # the server go back into the upload queue. Iterate over a snapshot:
        # deleting from the dict while iterating it raises RuntimeError.
        for subject_id, subject_row in list(
            upload_state['waiting_to_link'].items()
        ):
            try:
                Subject.find(subject_id)
            except PanoptesAPIException:
                upload_state['waiting_to_upload'].append(subject_row)
                del upload_state['waiting_to_link'][subject_id]
        subject_rows = copy.deepcopy(upload_state['waiting_to_upload'])

    pending_subjects = []

    def move_created(limit):
        # Block until at most `limit` saves are still outstanding, moving
        # every finished subject from the upload queue to the link queue.
        while len(pending_subjects) > limit:
            # Iterate over a copy: removing from the list being iterated
            # would skip the element after each removal.
            for subject, subject_row in list(pending_subjects):
                if subject.async_save_result:
                    pending_subjects.remove((subject, subject_row))
                    upload_state['waiting_to_upload'].remove(subject_row)
                    upload_state['waiting_to_link'][subject.id] = subject_row
            time.sleep(0.5)

    def link_subjects(limit):
        # Link the queued subjects in one call once more than `limit` wait.
        if len(upload_state['waiting_to_link']) > limit:
            subject_set.add(list(upload_state['waiting_to_link'].keys()))
            upload_state['waiting_to_link'].clear()

    with click.progressbar(
        subject_rows,
        length=len(subject_rows),
        label='Uploading subjects',
    ) as _subject_rows:
        try:
            with Subject.async_saves():
                for subject_row in _subject_rows:
                    count, (files, metadata) = subject_row
                    subject = Subject()
                    subject.links.project = subject_set.links.project
                    for media_file in files:
                        subject.add_location(media_file)
                    subject.metadata.update(metadata)
                    subject.save()

                    pending_subjects.append((subject, subject_row))

                    move_created(MAX_PENDING_SUBJECTS)
                    link_subjects(LINK_BATCH_SIZE)

            # Drain whatever is left once every row has been submitted.
            move_created(0)
            link_subjects(0)
        finally:
            # Anything still pending here means the upload did not finish.
            if (
                len(pending_subjects) > 0
                or len(upload_state['waiting_to_link']) > 0
            ):
                click.echo('Error: Upload failed.', err=True)
                if click.confirm(
                    'Would you like to save the upload state to resume the '
                    'upload later?',
                    default=True,
                ):
                    # Keep prompting until a usable file name is confirmed.
                    while True:
                        state_file_name = 'panoptes-upload-{}.yaml'.format(
                            subject_set_id,
                        )
                        state_file_name = click.prompt(
                            'Enter filename to save to',
                            default=state_file_name,
                        )

                        if not state_file_name.endswith('.yaml'):
                            click.echo(
                                'Error: File name must end in ".yaml".',
                                err=True,
                            )
                            if click.confirm(
                                'Save to {}.yaml?'.format(state_file_name),
                                default=True,
                            ):
                                state_file_name += '.yaml'
                            else:
                                continue
                        if not is_valid_filename(state_file_name):
                            click.echo(
                                'Error: {} is not a valid file name'.format(
                                    state_file_name,
                                ),
                                err=True,
                            )
                            sanitized_filename = sanitize_filename(
                                state_file_name,
                            )
                            if click.confirm(
                                'Save to {}?'.format(
                                    sanitized_filename,
                                ),
                                default=True,
                            ):
                                state_file_name = sanitized_filename
                            else:
                                continue
                        if os.path.exists(state_file_name):
                            if not click.confirm(
                                'File {} already exists. Overwrite?'.format(
                                    state_file_name,
                                ),
                                default=False,
                            ):
                                continue
                        break

                    with open(state_file_name, 'w') as state_file:
                        yaml.dump(upload_state, state_file)
# find or build destination subject set try: # check if the subject set already exits subject_set_new = SubjectSet.where(project_id=proj.id, display_name=new_set_name).next() except StopIteration: # create a new subject set for the new data and link it to the project above subject_set_new = SubjectSet() subject_set_new.links.project = proj subject_set_new.display_name = new_set_name subject_set_new.save() # iterate through the subjects duplicating them and verifying they are created. k = 0 for old_sub in add_subjects: old_subject = Subject(old_sub) try: new_subject = Subject() new_subject.links.project = proj for loc in old_subject.locations: new_subject.add_location(loc) new_subject.metadata = old_subject.metadata new_subject.save() subject_set_new.add(new_subject) print(new_subject.id, 'duplicated in new set to new set') k += 1 except panoptes_client.panoptes.PanoptesAPIException: print(old_sub, 'did not duplicate correctly', str(sys.exc_info()[1])) print(k, ' subjects linked to subject set ', new_set_name, ' in project ', proj_id) linked = 0
# Copy the input CSV to "<name>_with_locations<ext>", appending to every row
# the 'image/jpeg' URL of the first location of the subject whose id is in
# the row's first column.
input_file = args.input_file
if not os.path.exists(input_file):
    print('[%s] does not exist.' % input_file)
    sys.exit()

# splitext only splits at the final extension, so dots elsewhere in the path
# (e.g. "./data/my.file.csv") no longer corrupt the output name.
input_root, input_extension = os.path.splitext(input_file)
output_file = input_root + '_with_locations' + input_extension

with open(input_file, 'r') as in_file:
    in_put = csv.reader(in_file, dialect='excel')
    headers = next(in_put)
    headers.append('subject_locations')
    with open(output_file, 'w', newline='') as out_file:
        write_added = csv.writer(out_file, delimiter=',')
        write_added.writerow(headers)
        line_counter = 0
        for line in in_put:
            try:
                subject = Subject(line[0])
                line.append(subject.locations[0]['image/jpeg'])
            except KeyError:
                # The row is still written below, just without a location.
                print(line[0], 'Did not find a subject image file for this subject')
            # progress marker every 25 rows
            if line_counter % 25 == 0:
                print('.')
            write_added.writerow(line)
            line_counter += 1
print('Added subject locations to', input_file, '. Rewritten as', output_file, ',', line_counter, 'subjects located.')
class TestUploadSubject:
    """Unit tests for the ``UploadSubject`` operation.

    Every Panoptes/theia collaborator is replaced with ``@patch``. Stacked
    ``@patch`` decorators are applied bottom-up, so the mock arguments of
    each test are listed in the reverse order of its decorators.
    """

    @patch('theia.api.models.Pipeline.name_subject_set', return_value='pipeline name')
    @patch('theia.operations.panoptes_operations.UploadSubject._connect')
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._get_subject_set',
        return_value=SubjectSet())
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._create_subject',
        return_value=Subject())
    @patch('panoptes_client.SubjectSet.add')
    def test_apply_single(self, mockAdd, mockCreate, mockGet, mockConnect, mockGetName, *args):
        """Default pipeline: the subject set is resolved and named via the pipeline."""
        project = Project(id=8)
        pipeline = Pipeline(project=project)
        bundle = JobBundle(pipeline=pipeline)

        operation = UploadSubject(bundle)
        operation.apply(['some_file'])

        mockConnect.assert_called_once()
        mockGetName.assert_called_once()
        # the pipeline (not the bundle) is the scope passed to _get_subject_set
        mockGet.assert_called_once_with(pipeline, 8, 'pipeline name')
        mockCreate.assert_called_once_with(8, 'some_file')
        mockAdd.assert_called_once_with(mockCreate.return_value)

    @patch('theia.api.models.JobBundle.name_subject_set', return_value='bundle name')
    @patch('theia.operations.panoptes_operations.UploadSubject._connect')
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._get_subject_set',
        return_value=SubjectSet())
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._create_subject',
        return_value=Subject())
    @patch('panoptes_client.SubjectSet.add')
    def test_apply_multiple(self, mockAdd, mockCreate, mockGet, mockConnect, mockGetName, *args):
        """``multiple_subject_sets=True``: the subject set is resolved and named via the bundle."""
        project = Project(id=8)
        pipeline = Pipeline(project=project, multiple_subject_sets=True)
        bundle = JobBundle(pipeline=pipeline)

        operation = UploadSubject(bundle)
        operation.apply(['some_file'])

        mockConnect.assert_called_once()
        mockGetName.assert_called_once()
        # here the bundle is the scope passed to _get_subject_set
        mockGet.assert_called_once_with(bundle, 8, 'bundle name')
        mockCreate.assert_called_once_with(8, 'some_file')
        mockAdd.assert_called_once_with(mockCreate.return_value)

    @patch('panoptes_client.Panoptes.connect')
    @patch('theia.utils.PanoptesUtils.base_url', return_value='sample url')
    @patch('theia.utils.PanoptesUtils.client_id', return_value='sample id')
    @patch('theia.utils.PanoptesUtils.client_secret', return_value='sample secret')
    def test__connect(self, mockSecret, mockId, mockUrl, mockConnect):
        """``_connect`` passes the PanoptesUtils settings to ``Panoptes.connect``."""
        operation = UploadSubject(JobBundle())
        operation._connect()

        mockUrl.assert_called_once()
        mockId.assert_called_once()
        mockSecret.assert_called_once()
        mockConnect.assert_called_once_with(endpoint='sample url',
                                            client_id='sample id',
                                            client_secret='sample secret')

    @patch('theia.api.models.JobBundle.save')
    @patch('theia.api.models.Pipeline.save')
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._create_subject_set'
    )
    @patch('panoptes_client.SubjectSet.find', autospec=True)
    def test__get_subject_set(self, mockFind, mockCreateSet, *args):
        """``_get_subject_set`` creates a set when the scope has no
        ``subject_set_id`` and looks the set up when it has one, for both
        JobBundle and Pipeline scopes.
        """
        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        emptyJobBundle = JobBundle()
        linkedJobBundle = JobBundle(subject_set_id=3)
        emptyPipeline = Pipeline()
        linkedPipeline = Pipeline(subject_set_id=3)

        # bundle without a subject set -> a new set is created
        operation = UploadSubject(emptyJobBundle)
        result = operation._get_subject_set(emptyJobBundle, 8, 'some name')
        mockFind.assert_not_called()
        mockCreateSet.assert_called_once_with(8, 'some name')

        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        # bundle with a subject set -> the existing set is looked up
        operation = UploadSubject(linkedJobBundle)
        result = operation._get_subject_set(linkedJobBundle, 8, 'some name')
        mockFind.assert_called_once_with(3)
        mockCreateSet.assert_not_called()

        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        # pipeline without a subject set -> a new set is created
        operation = UploadSubject(emptyPipeline)
        result = operation._get_subject_set(emptyPipeline, 8, 'some name')
        mockFind.assert_not_called()
        mockCreateSet.assert_called_once_with(8, 'some name')

        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        # pipeline with a subject set -> the existing set is looked up
        operation = UploadSubject(linkedPipeline)
        result = operation._get_subject_set(linkedPipeline, 8, 'some name')
        mockFind.assert_called_once_with(3)
        mockCreateSet.assert_not_called()

    @patch('panoptes_client.Project.find', return_value=Mock())
    @patch('panoptes_client.Subject.save', autospec=True)
    @patch('panoptes_client.Subject.add_location', autospec=True)
    def test__create_subject_no_metadata(self, mockAdd, mockSave, mockFind):
        """``_create_subject`` without metadata: project found, location added, subject saved."""
        operation = UploadSubject(None)
        created_subject = operation._create_subject(1, 'some_file')

        mockFind.assert_called_once_with(1)
        # add_location is autospecced, so the instance is the first call argument
        mockAdd.assert_called_once_with(created_subject, 'some_file')  # weird
        mockSave.assert_called_once()
        assert (mockFind.return_value == created_subject.links.project.id
                )  # weird

    @patch('panoptes_client.Project.find', return_value=Mock())
    @patch('panoptes_client.Subject.save', autospec=True)
    @patch('panoptes_client.Subject.add_location', autospec=True)
    def test__create_subject_with_metadata(self, mockAdd, mockSave, mockFind):
        """``_create_subject`` with metadata: as above, plus the metadata ends up on the subject."""
        operation = UploadSubject(None)
        created_subject = operation._create_subject(1, 'some_file', {'foo': 'bar'})

        mockFind.assert_called_once_with(1)
        # add_location is autospecced, so the instance is the first call argument
        mockAdd.assert_called_once_with(created_subject, 'some_file')  # weird
        mockSave.assert_called_once()
        assert (mockFind.return_value == created_subject.links.project.id
                )  # weird
        assert (created_subject.metadata == {'foo': 'bar'})

    @patch('panoptes_client.Project.find', return_value=Mock())
    @patch('panoptes_client.SubjectSet.save', autospec=True)
    def test__create_subject_set(self, mockSave, mockFind):
        """``_create_subject_set`` finds the project, names the set and saves it."""
        operation = UploadSubject(None)
        created_set = operation._create_subject_set(1, 'some name')

        mockFind.assert_called_once_with(1)
        mockSave.assert_called_once()
        assert (created_set.display_name == 'some name')
        assert (mockFind.return_value == created_set.links.project.id)  # weird
def main():
    """Bin a list of image IDs into Zooniverse subject sets of at most n.

    Command-line options:
        -f / --filename  file listing one image ID per line (required)
        -n / --size      maximum subjects per subject set, 1..10000
                         (default 1000)

    For every ID the matching MongoDB record supplies the path to upload
    (``record['file']['anonPath']``).  A Panoptes subject is created for it,
    added to the current subject set, and the record is then marked as sent.
    Finally all new subject sets are attached to the project's first
    workflow and their IDs are printed.

    Raises:
        ValueError: if the requested size is outside 1..10000.
        LookupError: if an ID from the file has no record in the collection.
    """
    ap = argparse.ArgumentParser(
        description=
        'Given a list of images, bins them into subject sets of size n')

    # require file path to read in images
    ap.add_argument('-f',
                    '--filename',
                    required=True,
                    dest='filename',
                    type=str,
                    help='The name of the file from which to read the images')

    # optionally require subject set size; defaults to 1000
    ap.add_argument(
        '-n',
        '--size',
        required=False,
        dest='n',
        type=int,
        default=1000,
        help='The maximum number of images a subject set should contain. '
        'The value should be between 1 and 10000, inclusive')

    # parse args into variables and check values
    args = ap.parse_args()
    filename = args.filename
    # Use the parsed integer directly: mapping a falsy 0 to None made the
    # range check below fail with a TypeError instead of this ValueError.
    n = args.n
    if not 1 <= n <= 10000:
        raise ValueError('n must be between 1 and 10000, inclusive')

    # connect to zooniverse
    Panoptes.connect(username=zooniverse_config.Zooniverse_USERNAME,
                     password=zooniverse_config.Zooniverse_PASS)
    project = Project.find(zooniverse_config.Project_ID)

    # connection to mongodb
    mongoConn = MongoClient(csh_db_config.DB_HOST + ":" +
                            str(csh_db_config.DB_PORT))
    cshTransDB = mongoConn[csh_db_config.TRANSCRIPTION_DB_NAME]
    # NOTE(review): Database.authenticate() was removed in PyMongo 4.0; this
    # call requires PyMongo 3.x.
    cshTransDB.authenticate(csh_db_config.TRANSCRIPTION_DB_USER,
                            csh_db_config.TRANSCRIPTION_DB_PASS)
    cshCollection = cshTransDB[csh_db_config.TRANS_DB_MeetingMinColl]

    # track subject sets being created
    subjectSets = []

    # Read the image IDs, dropping trailing whitespace/newlines.  Blank lines
    # are skipped so that a stray empty line (e.g. at the end of the file)
    # does not trigger a lookup for an empty ID.
    with open(filename) as handle:
        imageIds = [line.rstrip() for line in handle]
    imageIds = [imageId for imageId in imageIds if imageId]

    # divide files into consecutive groups of at most n
    filegroups = [imageIds[i:i + n] for i in range(0, len(imageIds), n)]

    for group in filegroups:
        # NOTE(review): the name has one-second resolution, so two sets
        # created within the same second get identical display names.
        displayName = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())

        # create a new subject set
        subjectSet = SubjectSet()
        subjectSet.links.project = project
        subjectSet.display_name = displayName
        subjectSet.save()

        subjectSetId = subjectSet.id
        subjectSets.append(subjectSetId)

        # The update document only depends on the subject set, so build it
        # once per group rather than once per file.
        updateQuery = {
            '$set': {
                'canCrowdsource': True,
                'transcription': {
                    'numClassifications': 5,
                    'subjectSetId': subjectSetId,
                    'status': 'sent'
                }
            }
        }

        # create a new subject for each file and add to the subject set
        for imageId in group:
            record = cshCollection.find_one({'_id': imageId})
            if record is None:
                # Fail with the offending ID instead of an opaque
                # "'NoneType' object is not subscriptable".
                raise LookupError(
                    'No database record found for image {!r}'.format(imageId))

            # create a new subject
            subject = Subject()
            subject.links.project = project
            subject.add_location(record['file']['anonPath'])
            subject.metadata['ID'] = imageId
            subject.save()

            # add to subject set
            subjectSet.add(subject)

            # mark the record as sent only after the subject is in the set
            cshCollection.find_one_and_update({'_id': imageId}, updateQuery)

    # add subject sets to the workflow
    workflow = project.links.workflows[0]
    workflow.add_subject_sets(subjectSets)

    # print helpful information to the console
    print('{} subject sets created with the following IDs: {}'.format(
        len(subjectSets), subjectSets))