예제 #1
0
def pushSubject(subjectSet, project, imageLocations, metadata, livePost,
                maxRetries=None, retryDelay=0):
    """Create a Subject, save it with retries, and add it to a subject set.

    Args:
        subjectSet: SubjectSet the saved subject is added to.
        project: Project the subject is linked to.
        imageLocations: iterable of image paths/URLs to attach as locations.
        metadata (dict): metadata attached to the subject.
        livePost (bool): when falsy, nothing is uploaded and None is returned.
        maxRetries (int | None): maximum number of retries after a
            ConnectionError; None (default) retries forever, matching the
            original behaviour.
        retryDelay (float): seconds to sleep between retries (default 0,
            matching the original behaviour of retrying immediately).

    Returns:
        The saved Subject, or None when livePost is falsy.
    """
    # Guard clause: dry runs upload nothing.
    if not livePost:
        return None

    subject = Subject()
    subject.links.project = project

    for image in imageLocations:
        subject.add_location(image)

    subject.metadata.update(metadata)

    # Retry transient connection failures; previously this looped forever
    # with no delay, hammering the API. Defaults preserve that behaviour,
    # but callers can now bound and pace the retries.
    attempts = 0
    while True:
        try:
            subject.save()
            break
        except ConnectionError as e:
            attempts += 1
            if maxRetries is not None and attempts > maxRetries:
                raise
            print('{} , TRYING AGAIN'.format(e))
            if retryDelay:
                import time
                time.sleep(retryDelay)

    subjectSet.add(subject)

    return subject
def make_tutorial_images(imagePaths, ellipseData, projectData):
    """Upload tutorial images as subjects to a Zooniverse subject set.

    Args:
        imagePaths: iterable of image file paths; the enumeration index is
            used as the group key into ellipseData.
        ellipseData: pandas GroupBy-like object keyed by image index,
            forwarded to make_metadata.
        projectData: dict with 'user_name', 'password' and 'subject_set'.
    """
    # Connect to Panoptes
    Panoptes.connect(
        username=projectData["user_name"], password=projectData["password"]
    )

    # Find the subject set once, up front. Previously this lookup ran inside
    # the loop (one API call per image) and, for an empty imagePaths,
    # subjectSet was never bound, so the final add() raised NameError.
    try:
        subjectSet = SubjectSet.find(projectData["subject_set"])
    except PanoptesAPIException as e:
        print(e)
        return

    newSubjects = []
    for imageId, imagePath in enumerate(imagePaths):
        print(f"Adding {imagePath}...")
        newSubject = Subject()
        newSubject.add_location(imagePath)
        newSubject.links.project = subjectSet.links.project
        newSubject.metadata.update(
            make_metadata(
                ellipseData.get_group(imageId).reset_index(drop=True), imagePath
            )
        )
        newSubject.save()
        newSubjects.append(newSubject)
    subjectSet.add(newSubjects)
예제 #3
0
def uploadSubjectToSet(project, subjectSet, locationsList, metadataList):
    """Upload a batch of three-location subjects (model JSON, image, model)
    to a subject set.

    Args:
        project: Project the subjects are linked to.
        subjectSet: SubjectSet the saved subjects are added to.
        locationsList: list of [image, model-vs-image JSON, model JSON]
            location triples.
        metadataList: list of metadata dicts, same length as locationsList.

    Returns:
        The subject set, or None on invalid arguments.
    """
    print('Uploading {} subjects to {}'.format(len(locationsList), subjectSet))
    # imagePath can be string or list, metadata must be same dimension
    if not len(locationsList) == len(metadataList):
        print(
            '\t\033[31mInvalid arguments, locationsList and metadataList',
            'must have same length\033[0m'
        )
        return
    subjects = []
    # total= is needed because zip() has no __len__, so tqdm would otherwise
    # show no progress fraction.
    for locations, meta in tqdm(zip(locationsList, metadataList),
                                total=len(locationsList)):
        # the json subjects need to be added in a more manual way so we can
        # specify a MIME type
        subject = Subject()
        subject.links.project = project
        # comparison between model and image
        addLocation(subject, {'application/json': locations[1]})
        # actual galaxy image
        subject.add_location(locations[0])
        # and now just the model
        addLocation(subject, {'application/json': locations[2]})
        for k, v in meta.items():
            subject.metadata[k] = v
        try:
            subject.save()
        except RuntimeError as e:
            # Report and skip failed saves. Previously the error was silently
            # swallowed and the *unsaved* subject was still linked to the set.
            print('\t\033[31mFailed to save subject: {}\033[0m'.format(e))
            continue
        subjects.append(subject)
    subjectSet.add(subjects)
    return subjectSet
    def push_new_row_subjects(self, source_subject, target_subject_set_id, row_paths_by_column):
        """
        Given image paths for the new column-indexed rows (row_paths_by_column), push new
        unclassified row subjects to the appropriate subject set, with metadata references to the
        source subject and column.
        """
        project = Project.find(settings.PROJECT_ID)
        target_set = SubjectSet.find(target_subject_set_id)

        created_subjects = []
        for column_index, row_paths in row_paths_by_column.items():
            self._logger.info('Creating %d new row subjects for column index %d for subject %s',
                              len(row_paths), column_index, source_subject.id)
            for path in row_paths:
                row_subject = Subject()
                row_subject.links.project = project
                # Carry selected fields over from the source document subject,
                # then record where this row came from.
                for field in ['book', 'page']:
                    row_subject.metadata[field] = source_subject.metadata[field]
                row_subject.metadata['source_document_subject_id'] = source_subject.id
                row_subject.metadata['source_document_column_index'] = column_index
                row_subject.add_location(path)
                row_subject.save()
                created_subjects.append(row_subject)

        target_set.add(created_subjects)
예제 #5
0
def save_subject(manifest_item, project, pbar=None):
    """
    Add manifest item to project. Note: follow with subject_set.add(subject) to associate with subject set.
    Args:
        manifest_item (dict): of form {png_loc: img.png, key_data: some_data_dict}
        project (str): project to upload subject too e.g. '5773' for Galaxy Zoo
        pbar (tqdm.tqdm): progress bar to update. If None, no bar will display.

    Returns:
        Subject: the saved subject (docstring previously claimed None).

    Raises:
        FileNotFoundError: if manifest_item['png_loc'] does not exist.
    """
    subject = Subject()

    subject.links.project = project
    png_loc = manifest_item['png_loc']
    # Explicit exception instead of `assert`, which is stripped under
    # `python -O` and would let a missing file slip through.
    if not os.path.exists(png_loc):
        raise FileNotFoundError('Missing subject image: {}'.format(png_loc))
    subject.add_location(png_loc)
    subject.metadata.update(manifest_item['key_data'])

    subject.save()

    if pbar:
        pbar.update()

    return subject
예제 #6
0
    def add_new_subject(self, image_list, metadata_list, subject_set_name):
        """
        Add a subject and the metadata.  image_list and metadata_list must be
        of equal length
        :param image_list: list of images to be added
        :param metadata_list: list of metadata to be added
        :param subject_set_name: display name for the new subject set
        :return:
        """

        # Start by making sure we have two equal length lists. Bail out on a
        # mismatch: previously this only printed a warning and carried on,
        # heading for an IndexError (or silently dropped metadata).
        if len(image_list) != len(metadata_list):
            print("Image list and metadata list do not match")
            return

        # Link to the subject set we want
        subject_set = SubjectSet()
        subject_set.links.project = self.project
        subject_set.display_name = subject_set_name
        subject_set.save()

        # Go through the image and metadata list and add the items
        new_subjects = []
        for image, metadata in zip(image_list, metadata_list):
            subject = Subject()
            subject.links.project = self.project
            subject.add_location(image)
            subject.metadata.update(metadata)
            subject.save()
            new_subjects.append(subject)

        subject_set.add(new_subjects)
예제 #7
0
def create_subject(project, metadata, media_files):
    """Build, populate and save a single Subject linked to *project*."""
    new_subject = Subject()
    new_subject.links.project = project
    for location in media_files:
        new_subject.add_location(location)
    new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
예제 #8
0
    def _create_subject(self, project_id, filename, metadata=None):
        """Create and save a Subject for *filename* under project *project_id*.

        Metadata is attached only when a non-empty mapping is supplied.
        """
        new_subject = Subject()
        new_subject.links.project = Project.find(project_id)
        new_subject.add_location(filename)
        if metadata:
            new_subject.metadata.update(metadata)
        new_subject.save()
        return new_subject
def create_subject(project, media_files, metadata):
    """ Create a subject
        Args:
        - project: a Project() object defining the Zooniverse project
        - media_files: a list of media files to link to the subject
        - metadata: a dictionary with metadata to attach
    """
    new_subject = Subject()
    new_subject.links.project = project
    # Attach every media file as a location before saving.
    for media_file in media_files:
        new_subject.add_location(media_file)
    new_subject.metadata.update(metadata)
    new_subject.save()
    return new_subject
예제 #10
0
    def link_new_set(self, subject_set_id):
        """Link an existing subject set to a new workflow on this project.

        :param subject_set_id: id of the SubjectSet to link
        :return: None
        """
        # Rewritten: the previous version looked the set up via
        # Subject.find, called a nonexistent `links.sub`, and left debug
        # print(1)/print(2) statements behind. This follows the
        # `workflow.links.subject_sets.add(subject_set)` pattern used
        # elsewhere with the panoptes client.
        workflow = Workflow()
        workflow.links.project = self.project
        subject_set = SubjectSet.find(subject_set_id)
        workflow.links.subject_sets.add(subject_set)
예제 #11
0
def upload_subject(locations: List, project: Project, subject_set_name: str,
                   metadata: Dict):
    """Create a subject from *locations*, attach *metadata*, and add it to
    the named subject set (created if necessary).

    :param locations: list of local file paths for the subject's media
    :param project: the Zooniverse Project to link the subject to
    :param subject_set_name: display name of the target subject set
    :param metadata: metadata dict attached to the subject
    :return: the id of the saved subject
    :raises FileNotFoundError: if any location is not an existing file
    """
    subject = Subject()
    # add files
    subject.links.project = project
    for location in locations:
        if not os.path.isfile(location):
            raise FileNotFoundError(
                'Missing subject location: {}'.format(location))
        subject.add_location(location)

    subject.metadata.update(metadata)

    # (Removed the no-op `subject_set_name = subject_set_name`
    # self-assignment that used to sit here.)
    subject_set = get_or_create_subject_set(project.id, subject_set_name)

    subject.save()
    subject_set.add(subject)
    return subject.id
예제 #12
0
def upload_images(id, use_database=True):
    """Create a subject set named *id* and upload its images to the
    radio-galaxy-zoo-lofar project, then attach the set to workflow 11973.

    Reads ``*-manifest.txt`` files from the directory ``target + id``; the
    first line of each manifest supplies the metadata fields and the three
    image locations (columns 1-3).

    :param id: field identifier, also used as the subject set display name
    :param use_database: when True, update the status database before and
        after the upload
    """
    print('Create subject set and upload images for', id)
    if use_database:
        update_status(id, gz_status='Uploading')
    wd = os.getcwd()
    Panoptes.connect(username='******',
                     password=os.environ['PANOPTES_PASSWORD'])
    os.chdir(target + id)
    try:
        project = Project.find(slug='chrismrp/radio-galaxy-zoo-lofar')
        subject_set = SubjectSet()

        subject_set.display_name = id
        subject_set.links.project = project
        subject_set.save()
        print('Made subject set')
        new_subjects = []
        g = glob.glob('*-manifest.txt')
        for i, f in enumerate(g):
            bits = open(f).readlines()[0].split(',')
            metadata = {
                'subject_id': int(bits[0]),
                'ra': float(bits[5]),
                'dec': float(bits[6]),
                '#size': float(bits[7]),
                'source_name': bits[4]
            }
            # i + 1 so the progress counter reads 1/N..N/N, not 0/N..(N-1)/N.
            print('Upload doing', bits[4], '%i/%i' % (i + 1, len(g)))
            subject = Subject()
            subject.links.project = project
            subject.metadata.update(metadata)
            for location in bits[1:4]:
                subject.add_location(location)
            subject.save()
            new_subjects.append(subject)

        subject_set.add(new_subjects)

        workflow = Workflow(11973)
        workflow.links.subject_sets.add(subject_set)
    finally:
        # `wd` was captured above but never used before: restore it so the
        # caller is not left stranded in the data directory on exit/error.
        os.chdir(wd)
    if use_database:
        update_status(id, gz_status='In progress')
    print('Done!')
예제 #13
0
 def _create_subjects_from_epicollect5(self, project, subjects_metadata):
     """Build, save and return one Subject per Epicollect5 record."""
     # Keys copied verbatim from each record, in the original insertion order.
     direct_fields = ('id', 'project', 'obs_type', 'source', 'url',
                      'created_at', 'observer')
     subjects = []
     for metadata in subjects_metadata:
         subject = Subject()
         for field in direct_fields:
             subject.metadata[field] = metadata[field]
         subject.metadata['longitude'] = metadata['location']['longitude']
         subject.metadata['latitude'] = metadata['location']['latitude']
         subject.metadata['comment'] = metadata['comment']
         # spectrum_type is optional in the source data; default to "?".
         subject.metadata['spectrum_type'] = metadata.get(
             'spectrum_type', "?")
         subject.add_location({'image/jpg': metadata['url']})
         subject.links.project = project
         subject.save()
         subjects.append(subject)
     return subjects
예제 #14
0
    def create_subjects_and_link_to_project(self, proto_subjects, project_id,
                                            workflow_id, subject_set_id):
        """Create subjects from *proto_subjects* and attach them to a workflow.

        Connects to Panoptes using credentials from the environment, finds
        (or creates) a subject set, uploads one subject per proto_subject
        (two locations plus metadata each), and links the set to the workflow.

        :param proto_subjects: iterable of dicts with 'location_lc',
            'location_ps' and 'metadata' keys
        :param project_id: Panoptes project id
        :param workflow_id: workflow the subject set is attached to
        :param subject_set_id: existing subject set id, or None to create a
            new set named with the current UTC timestamp
        """
        try:
            USERNAME = os.getenv('PANOPTES_USERNAME')
            PASSWORD = os.getenv('PANOPTES_PASSWORD')
            Panoptes.connect(username=USERNAME,
                             password=PASSWORD,
                             endpoint=self.ENDPOINT)

            project = Project.find(project_id)
            # find() is a class-level lookup; no throwaway instance needed.
            workflow = Workflow.find(workflow_id)

            if subject_set_id is None:
                subject_set = SubjectSet()
                ts = time.gmtime()
                subject_set.display_name = time.strftime(
                    "%m-%d-%Y %H:%M:%S", ts)
                subject_set.links.project = project

                subject_set.save()
            else:
                subject_set = SubjectSet.find(subject_set_id)
            subjects = []
            for proto_subject in proto_subjects:
                subject = Subject()
                subject.links.project = project
                subject.add_location(proto_subject['location_lc'])
                subject.add_location(proto_subject['location_ps'])
                subject.metadata.update(proto_subject['metadata'])
                subject.save()
                subjects.append(subject)

            subject_set.add(subjects)
            workflow.add_subject_sets(subject_set)
        except Exception:
            # Deliberate top-level boundary: the full traceback is logged.
            self.log.exception("Error in create_subjects_and_link_to_project ")
예제 #15
0
    def upload_chunks(self,
                      destination,
                      project_slug,
                      set_prefix,
                      zooniverse_login,
                      zooniverse_pwd,
                      batches=0,
                      **kwargs):
        """Upload audio chunks to Zooniverse, one subject set per batch.

        Reads chunk metadata from ``<destination>/chunks.csv``, skips batches
        that are already fully uploaded, and writes each subject's zooniverse
        id back into the CSV after every batch so the upload can resume.

        :param destination: directory containing chunks.csv and chunks/
        :param project_slug: Zooniverse project slug
        :param set_prefix: prefix for the per-batch subject set names
        :param zooniverse_login: Zooniverse username
        :param zooniverse_pwd: Zooniverse password
        :param batches: stop after this many batches (0 = no limit)
        """
        self.destination = destination

        metadata_location = os.path.join(self.destination, 'chunks.csv')
        try:
            self.chunks = pd.read_csv(metadata_location, index_col='index')
        except Exception as e:
            # Narrowed from a bare `except:` (which also caught
            # KeyboardInterrupt/SystemExit); chain the cause for debugging.
            raise Exception(
                "cannot read chunk metadata in {}. Check the --destination parameter, and make sure you have extracted chunks before."
                .format(metadata_location)) from e

        Panoptes.connect(username=zooniverse_login, password=zooniverse_pwd)
        zooniverse_project = Project.find(slug=project_slug)

        uploaded = 0
        for batch, chunks in self.chunks.groupby('batch'):
            if chunks['uploaded'].all():
                continue

            subjects_metadata = []

            subject_set = SubjectSet()
            subject_set.links.project = zooniverse_project
            subject_set.display_name = "{}_batch_{}".format(set_prefix, batch)
            subject_set.save()
            subjects = []

            _chunks = chunks.to_dict(orient='index')
            for chunk_index in _chunks:
                chunk = _chunks[chunk_index]

                print("uploading chunk {} ({},{}) in batch {}".format(
                    chunk['recording'], chunk['onset'], chunk['offset'],
                    batch))

                subject = Subject()
                subject.links.project = zooniverse_project
                subject.add_location(
                    os.path.join(self.destination, 'chunks', chunk['mp3']))
                subject.metadata['date_extracted'] = chunk['date_extracted']
                subject.save()
                subjects.append(subject)

                # Record upload bookkeeping back onto the chunk row.
                chunk['index'] = chunk_index
                chunk['zooniverse_id'] = subject.id
                chunk['project_slug'] = project_slug
                chunk['subject_set'] = str(subject_set.display_name)
                chunk['uploaded'] = True
                subjects_metadata.append(chunk)

            subject_set.add(subjects)

            self.chunks.update(
                pd.DataFrame(subjects_metadata).set_index('index'))

            # Persist progress after every batch so a crash can resume.
            self.chunks.to_csv(os.path.join(self.destination, 'chunks.csv'))
            uploaded += 1

            if batches > 0 and uploaded >= batches:
                return
            }

            segments.append(segment)
    print('Item segments transformation complete.')
    return segments

# Fetch segment records for the Library of Congress item, then upload each
# one as a Zooniverse subject inside a freshly created subject set.
segments = transform_item_segments('https://www.loc.gov/item/' + LIBRARY_OF_CONGRESS_ITEM_ID)

Panoptes.connect(username=USERNAME, password=PASSWORD, endpoint=ENDPOINT)

project = Project.find(PROJECT)

subject_set = SubjectSet()
subject_set.links.project = project
subject_set.display_name = segments[0]['metadata']['Title'] # uses item Title as default subject set name, or feel free to hardcode
subject_set.save()

print('Begin Zooniverse subject upload...')
for segment in segments:
    subject = Subject()

    subject.links.project = project
    subject.add_location(segment['location'])

    subject.metadata.update(segment['metadata'])

    # Subjects are saved and linked to the set one at a time rather than
    # batched with a single add() call at the end.
    subject.save()
    subject_set.add(subject)

print("Zooniverse subject upload complete.")
예제 #17
0
    def upload_chunks(self,
                      chunks: str,
                      project_id: int,
                      set_name: str,
                      zooniverse_login="",
                      zooniverse_pwd="",
                      amount: int = 1000,
                      ignore_errors: bool = False,
                      **kwargs):
        """Uploads ``amount`` audio chunks from the CSV dataframe `chunks` to a zooniverse project.

        :param chunks: path to the chunk CSV dataframe
        :type chunks: [type]
        :param project_id: zooniverse project id
        :type project_id: int
        :param set_name: name of the subject set
        :type set_name: str
        :param zooniverse_login: zooniverse login. If not specified, the program attempts to get it from the environment variable ``ZOONIVERSE_LOGIN`` instead, defaults to ''
        :type zooniverse_login: str, optional
        :param zooniverse_pwd: zooniverse password. If not specified, the program attempts to get it from the environment variable ``ZOONIVERSE_PWD`` instead, defaults to ''
        :type zooniverse_pwd: str, optional
        :param amount: amount of chunks to upload, defaults to 0
        :type amount: int, optional
        :param ignore_errors: continue with the next chunk when a save fails
            instead of halting the upload, defaults to False
        :type ignore_errors: bool, optional
        """

        self.chunks_file = chunks
        self.get_credentials(zooniverse_login, zooniverse_pwd)

        metadata_location = os.path.join(self.chunks_file)
        try:
            self.chunks = pd.read_csv(metadata_location, index_col="index")
        except Exception as e:
            # Narrowed from a bare `except:`; chain the cause for debugging.
            raise Exception("cannot read chunk metadata from {}.".format(
                metadata_location)) from e

        assert_dataframe("chunks", self.chunks)
        assert_columns_presence(
            "chunks",
            self.chunks,
            {"recording_filename", "onset", "offset", "uploaded", "mp3"},
        )

        from panoptes_client import Panoptes, Project, Subject, SubjectSet

        Panoptes.connect(username=self.zooniverse_login,
                         password=self.zooniverse_pwd)
        zooniverse_project = Project(project_id)

        subjects_metadata = []
        uploaded = 0

        # Reuse the subject set if one with this name already exists.
        subject_set = None

        for ss in zooniverse_project.links.subject_sets:
            if ss.display_name == set_name:
                subject_set = ss

        if subject_set is None:
            subject_set = SubjectSet()
            subject_set.links.project = zooniverse_project
            subject_set.display_name = set_name
            subject_set.save()

        subjects = []

        chunks_to_upload = self.chunks[self.chunks["uploaded"] == False].head(
            amount)
        chunks_to_upload = chunks_to_upload.to_dict(orient="index")

        if len(chunks_to_upload) == 0:
            print("nothing left to upload.")
            return

        for chunk_index in chunks_to_upload:
            chunk = chunks_to_upload[chunk_index]

            print("uploading chunk {} ({},{})".format(
                chunk["recording_filename"], chunk["onset"], chunk["offset"]))

            subject = Subject()
            subject.links.project = zooniverse_project
            subject.add_location(
                os.path.join(os.path.dirname(self.chunks_file), "chunks",
                             chunk["mp3"]))
            subject.metadata["date_extracted"] = chunk["date_extracted"]

            try:
                subject.save()
            except Exception as e:
                print("failed to save chunk {}. an exception has occured:\n{}".
                      format(chunk_index, str(e)))
                print(traceback.format_exc())

                # Bug fix: this previously read `args.ignore_errors`, but
                # `args` is undefined in this scope — the method parameter
                # `ignore_errors` is what was meant.
                if ignore_errors:
                    continue
                else:
                    print("subject upload halting here.")
                    break

            subjects.append(subject)

            chunk["index"] = chunk_index
            chunk["zooniverse_id"] = str(subject.id)
            chunk["project_id"] = str(project_id)
            chunk["subject_set"] = str(subject_set.display_name)
            chunk["uploaded"] = True
            subjects_metadata.append(chunk)

        if len(subjects) == 0:
            return

        subject_set.add(subjects)

        self.chunks.update(pd.DataFrame(subjects_metadata).set_index("index"))

        self.chunks.to_csv(self.chunks_file)
예제 #18
0
# delete the tmp file after the images have been resized

# Connect to Zooniverse. On failure, log the timestamp and stop: previously
# the except branch only logged and fell through, so `project` was undefined
# and every later use raised NameError.
try:
    Panoptes.connect(username=zcfg.login['user'], password=zcfg.login['pass'])
    project = Project.find("6307")
except Exception as e:
    t = time.localtime()
    # `with` guarantees the log file is closed even if the write fails.
    with open(logfile, "a") as f:
        f.write('Unable to connect to Zooniverse: ' +
                time.strftime("%D:%H:%M:%S", t) + '\n')
    raise SystemExit('Unable to connect to Zooniverse: {}'.format(e))


subject_set = SubjectSet()
# (Removed a dead `s = Subject()` here — `s` is rebound inside the loop below
# before it is ever read.)

subject_set.links.project = project
subject_set.display_name = 'Tutorial subject set 2'

images = glob.glob(path)
new_subjects = []

for img in images:
    try:
        s = Subject()
        s.links.project = project
        # manifest file
        if os.path.splitext(img)[1] == ".csv":   # upload manifest info.... not sure how this will be set up after second step
            # move csv to complete images folder
            shutil.copy(f, completed_images)
예제 #19
0
def create_subjects_and_link_to_project(proto_subjects,
                                        project_id,
                                        subject_set_id,
                                        subject_set_name=None):
    ''' find the project and relevant subject set. Get the existing subject data and compare to the new proto_subjects.
    Upload any instances of new subjects to the project

    Keyword Arguments:
    proto_subjects -- dictionary structure containing subject filepath+filename, and associated metadata
    project_id -- identifier to find and link with the project
    subject_set_id -- identifier for the subject set of interest; None creates
                      a new subject set
    subject_set_name -- optional display name; generated randomly when a new
                        set is created without one, and checked against the
                        existing name when subject_set_id is given
    '''

    # get the project object
    project = Project.find(project_id)

    # set up subject_set
    if subject_set_id is None:
        subject_set = SubjectSet()  # create empty subject_set
        subject_set.links.project = project

        if subject_set_name is None:  # if not defined generate a random subject set name to avoid error when a set already exists
            # NOTE(review): `date.day` etc. assume `date` is a bound
            # date/datetime instance elsewhere in this module — confirm.
            subject_set_name = 'subject_set_{:02d}_{:02d}_{:04d}_{}'.format(
                date.day, date.month, date.year,
                ''.join(generate_random_str()))
        print("will create a subject set called: {}".format(subject_set_name))
        subject_set.display_name = subject_set_name  # set the name of the subject set
        subject_set.save()
        project.reload()
    else:
        # find() is a class-level lookup; no throwaway instance needed
        subject_set = SubjectSet.find(
            subject_set_id)  # find the existing subject_set
        existing_subject_set_name = subject_set.display_name  # get its name

        # if you have tried to set the subject set name, check that it matches the name for the chosen subject set id
        if (subject_set_name is not None) and (existing_subject_set_name !=
                                               subject_set_name):
            print(
                "your chosen subject set name does not match the existing name: {}, {}"
                .format(subject_set_name, existing_subject_set_name))
            return -1
        else:
            subject_set_name = existing_subject_set_name

        print("add to existing subject set: {}".format(subject_set_name))

    # Create a list of the existing subject metadata
    meta_list = []
    print("existing subjects:")
    for subject in subject_set.subjects:
        print(subject.id, subject.metadata)
        meta_list.append(subject.metadata)

    # When making list of subjects to add, check to see if the metadata of the subject you want to add is already in the set
    print("new subjects:")
    new_subjects = []
    for filename, metadata in proto_subjects.items():

        # check if this subject is already in the subject set
        # NOTE(review): np.isin over dict-like metadata relies on numpy's
        # element comparison semantics — confirm this matches intent.
        if np.isin(metadata, meta_list):
            print("{}, subject already in set".format(metadata))
            # In this case we skip over the subject that already exists.
            # N.B. you may want to remove an existing subject and update it with the new one
            continue

        # Otherwise we can add the subject to the new subject list
        else:
            subject = Subject()

            subject.links.project = project
            subject.add_location(filename)

            subject.metadata.update(metadata)

            subject.save()
            new_subjects.append(subject)
            print("{}, new subject add to list".format(metadata))

    print("new subjects to add: {}".format(new_subjects))

    # add the new subject list (data and metadata) to the already defined project subject set
    subject_set.add(new_subjects)

    return
예제 #20
0
def upload_subjects(
    subject_set_id,
    manifest_files,
    allow_missing,
    remote_location,
    mime_type,
    file_column,
):
    """
    Uploads subjects from each of the given MANIFEST_FILES.

    Example with only local files:

    $ panoptes subject-set upload-subjects 4667 manifest.csv

    Local filenames will be automatically detected in the manifest and
    uploaded, or filename columns can be specified with --file-column.

    If you are hosting your media yourself, you can put the URLs in the
    manifest and specify the column number(s):

    $ panoptes subject-set upload-subjects -r 1 4667 manifest.csv

    $ panoptes subject-set upload-subjects -r 1 -r 2 4667 manifest.csv

    Any local files will still be detected and uploaded.
    """
    if (
        len(manifest_files) > 1
        and any(map(lambda m: m.endswith('.yaml'), manifest_files))
    ):
        click.echo(
            'Error: YAML manifests must be processed one at a time.',
            err=True,
        )
        return -1
    elif manifest_files[0].endswith('.yaml'):
        with open(manifest_files[0], 'r') as yaml_manifest:
            upload_state = yaml.load(yaml_manifest, Loader=yaml.FullLoader)
        if upload_state['state_version'] > CURRENT_STATE_VERSION:
            click.echo(
                'Error: {} was generated by a newer version of the Panoptes '
                'CLI and is not compatible with this version.'.format(
                    manifest_files[0],
                ),
                err=True,
            )
            return -1
        if upload_state['subject_set_id'] != subject_set_id:
            click.echo(
                'Warning: You specified subject set {} but this YAML '
                'manifest is for subject set {}.'.format(
                    subject_set_id,
                    upload_state['subject_set_id'],
                ),
                err=True,
            )
            click.confirm(
                'Upload {} to subject set {} ({})?'.format(
                    manifest_files[0],
                    subject_set_id,
                    SubjectSet.find(subject_set_id).display_name,
                ),
                abort=True
            )
            upload_state['subject_set_id'] = subject_set_id
        resumed_upload = True
    else:
        upload_state = {
            'state_version': CURRENT_STATE_VERSION,
            'subject_set_id': subject_set_id,
            'manifest_files': manifest_files,
            'allow_missing': allow_missing,
            'remote_location': remote_location,
            'mime_type': mime_type,
            'file_column': file_column,
            'waiting_to_upload': [],
            'waiting_to_link': {},
        }
        resumed_upload = False

    remote_location_count = len(upload_state['remote_location'])
    mime_type_count = len(upload_state['mime_type'])
    if remote_location_count > 1 and mime_type_count == 1:
        upload_state['mime_type'] = (
            upload_state['mime_type'] * remote_location_count
        )
    elif remote_location_count > 0 and mime_type_count != remote_location_count:
        click.echo(
            'Error: The number of MIME types given must be either 1 or equal '
            'to the number of remote locations.',
            err=True,
        )
        return -1

    def validate_file(file_path):
        if not os.path.isfile(file_path):
            click.echo(
                'Error: File "{}" could not be found.'.format(
                    file_path,
                ),
                err=True,
            )
            return False

        file_size = os.path.getsize(file_path)
        if file_size == 0:
            click.echo(
                'Error: File "{}" is empty.'.format(
                    file_path,
                ),
                err=True,
            )
            return False
        elif file_size > MAX_UPLOAD_FILE_SIZE:
            click.echo(
                'Error: File "{}" is {}, larger than the maximum {}.'.format(
                    file_path,
                    humanize.naturalsize(file_size),
                    humanize.naturalsize(MAX_UPLOAD_FILE_SIZE),
                ),
                err=True,
            )
            return False
        return True

    subject_set = SubjectSet.find(upload_state['subject_set_id'])
    if not resumed_upload:
        subject_rows = []
        for manifest_file in upload_state['manifest_files']:
            with open(manifest_file, 'U') as manifest_f:
                file_root = os.path.dirname(manifest_file)
                r = csv.reader(manifest_f, skipinitialspace=True)
                headers = next(r)
                for row in r:
                    metadata = dict(zip(headers, row))
                    files = []
                    if not upload_state['file_column']:
                        upload_state['file_column'] = []
                        for field_number, col in enumerate(row, start=1):
                            file_path = os.path.join(file_root, col)
                            if os.path.exists(file_path):
                                upload_state['file_column'].append(
                                    field_number,
                                )
                                if not validate_file(file_path):
                                    return -1
                                files.append(file_path)
                    else:
                        for field_number in upload_state['file_column']:
                            file_path = os.path.join(
                                file_root,
                                row[field_number - 1]
                            )
                            if not validate_file(file_path):
                                return -1
                            files.append(file_path)

                    for field_number, _mime_type in zip(
                        upload_state['remote_location'],
                        upload_state['mime_type'],
                    ):
                        files.append({_mime_type: row[field_number - 1]})

                    if len(files) == 0:
                        click.echo(
                            'Could not find any files in row:',
                            err=True,
                        )
                        click.echo(','.join(row), err=True)
                        if not upload_state['allow_missing']:
                            return -1
                        else:
                            continue
                    subject_rows.append((files, metadata))

                if not subject_rows:
                    click.echo(
                        'File {} did not contain any rows.'.format(
                            manifest_file,
                        ),
                        err=True,
                    )
                    return -1

        subject_rows = list(enumerate(subject_rows))
        upload_state['waiting_to_upload'] = copy.deepcopy(subject_rows)
    else:
        for subject_id, subject_row in upload_state['waiting_to_link'].items():
            try:
                subject = Subject.find(subject_id)
            except PanoptesAPIException:
                upload_state['waiting_to_upload'].append(subject_row)
                del upload_state['waiting_to_link'][subject_id]
        subject_rows = copy.deepcopy(upload_state['waiting_to_upload'])

    pending_subjects = []

    def move_created(limit):
        """Poll until at most *limit* async subject saves remain pending.

        Each subject whose async save has completed is moved from the
        'waiting_to_upload' queue to the 'waiting_to_link' queue so it
        can later be attached to the subject set.
        """
        while len(pending_subjects) > limit:
            # Iterate over a snapshot: calling .remove() on the list we
            # are looping over would silently skip the element after
            # each removal.
            for subject, subject_row in list(pending_subjects):
                if subject.async_save_result:
                    pending_subjects.remove((subject, subject_row))
                    upload_state['waiting_to_upload'].remove(subject_row)
                    upload_state['waiting_to_link'][subject.id] = subject_row
            time.sleep(0.5)

    def link_subjects(limit):
        """Attach every queued subject to the subject set once more than
        *limit* are waiting, then empty the queue.
        """
        waiting = upload_state['waiting_to_link']
        if len(waiting) > limit:
            subject_set.add(list(waiting.keys()))
            waiting.clear()

    with click.progressbar(
        subject_rows,
        length=len(subject_rows),
        label='Uploading subjects',
    ) as _subject_rows:
        try:
            with Subject.async_saves():
                for subject_row in _subject_rows:
                    count, (files, metadata) = subject_row
                    subject = Subject()
                    subject.links.project = subject_set.links.project
                    for media_file in files:
                        subject.add_location(media_file)
                    subject.metadata.update(metadata)
                    subject.save()

                    pending_subjects.append((subject, subject_row))

                    move_created(MAX_PENDING_SUBJECTS)
                    link_subjects(LINK_BATCH_SIZE)

            move_created(0)
            link_subjects(0)
        finally:
            if (
                len(pending_subjects) > 0
                or len(upload_state['waiting_to_link']) > 0
            ):
                click.echo('Error: Upload failed.', err=True)
                if click.confirm(
                    'Would you like to save the upload state to resume the '
                    'upload later?',
                    default=True,
                ):
                    while True:
                        state_file_name = 'panoptes-upload-{}.yaml'.format(
                            subject_set_id,
                        )
                        state_file_name = click.prompt(
                            'Enter filename to save to',
                            default=state_file_name,
                        )

                        if not state_file_name.endswith('.yaml'):
                            click.echo(
                                'Error: File name must end in ".yaml".',
                                err=True,
                            )
                            if click.confirm(
                                'Save to {}.yaml?'.format(state_file_name),
                                default=True,
                            ):
                                state_file_name += '.yaml'
                            else:
                                continue
                        if not is_valid_filename(state_file_name):
                            click.echo(
                                'Error: {} is not a valid file name'.format(
                                    state_file_name,
                                ),
                                err=True,
                            )
                            sanitized_filename = sanitize_filename(
                                state_file_name,
                            )
                            if click.confirm(
                                'Save to {}?'.format(
                                    sanitized_filename,
                                ),
                                default=True,
                            ):
                                state_file_name = sanitized_filename
                            else:
                                continue
                        if os.path.exists(state_file_name):
                            if not click.confirm(
                                'File {} already exists. Overwrite?'.format(
                                    state_file_name,
                                ),
                                default=False,
                            ):
                                continue
                        break

                    with open(state_file_name, 'w') as state_file:
                        yaml.dump(upload_state, state_file)
예제 #21
0
# Find the destination subject set, or build it if it does not exist yet.
try:
    # Check whether the subject set already exists in this project.
    subject_set_new = SubjectSet.where(project_id=proj.id, display_name=new_set_name).next()
except StopIteration:
    # Create a new subject set for the new data and link it to the project.
    subject_set_new = SubjectSet()
    subject_set_new.links.project = proj
    subject_set_new.display_name = new_set_name
    subject_set_new.save()

# Iterate through the subjects, duplicating them and verifying they are created.
k = 0
for old_sub in add_subjects:
    old_subject = Subject(old_sub)
    try:
        new_subject = Subject()
        new_subject.links.project = proj
        for loc in old_subject.locations:
            new_subject.add_location(loc)
        new_subject.metadata = old_subject.metadata
        new_subject.save()
        subject_set_new.add(new_subject)
        print(new_subject.id, 'duplicated to new set')
        k += 1
    except panoptes_client.panoptes.PanoptesAPIException as err:
        # Bind the exception directly instead of digging it back out of
        # sys.exc_info() -- same message text, clearer intent.
        print(old_sub, 'did not duplicate correctly', str(err))
print(k, ' subjects linked to subject set ', new_set_name, ' in project ', proj_id)

linked = 0
예제 #22
0
input_file = args.input_file
if not os.path.exists(input_file):
    print('[%s] does not exist.' % input_file)
    sys.exit()

# Derive the output name with os.path.splitext: the previous naive
# input_file.split('.') broke on dotted directory names and multi-dot
# filenames (wrong output path, or IndexError when no '.' was present).
_base, _ext = os.path.splitext(input_file)
output_file = _base + '_with_locations' + _ext
with open(input_file, 'r') as in_file:

    in_put = csv.reader(in_file, dialect='excel')
    headers = next(in_put)  # idiomatic alternative to in_put.__next__()
    headers.append('subject_locations')
    with open(output_file, 'w', newline='') as out_file:
        write_added = csv.writer(out_file, delimiter=',')
        write_added.writerow(headers)
        line_counter = 0
        for line in in_put:
            try:
                # Look up the subject by its id (first CSV column) and
                # append its first 'image/jpeg' location to the row.
                subject = Subject(line[0])
                line.append(subject.locations[0]['image/jpeg'])
            except KeyError:
                # Best effort: the row is still written, just without a
                # location column.
                print(line[0],
                      'Did not find a subject image file for this subject')
            if line_counter % 25 == 0:
                print('.')  # lightweight progress indicator
            write_added.writerow(line)
            line_counter += 1
        print('Added subject locations to', input_file, '. Rewritten as',
              output_file, ',', line_counter, 'subjects located.')
예제 #23
0
class TestUploadSubject:
    """Unit tests for theia's UploadSubject Panoptes operation.

    All Panoptes network calls are patched out, so these tests verify
    only the wiring: which helpers are called and with what arguments.
    NOTE: @patch decorators apply bottom-up, so mock parameters arrive
    in reverse decorator order (innermost patch first).
    """

    @patch('theia.api.models.Pipeline.name_subject_set',
           return_value='pipeline name')
    @patch('theia.operations.panoptes_operations.UploadSubject._connect')
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._get_subject_set',
        return_value=SubjectSet())
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._create_subject',
        return_value=Subject())
    @patch('panoptes_client.SubjectSet.add')
    def test_apply_single(self, mockAdd, mockCreate, mockGet, mockConnect,
                          mockGetName, *args):
        """apply() on a single-subject-set pipeline resolves the set name
        via Pipeline.name_subject_set and uploads the given file.
        """
        project = Project(id=8)
        pipeline = Pipeline(project=project)
        bundle = JobBundle(pipeline=pipeline)

        operation = UploadSubject(bundle)
        operation.apply(['some_file'])

        mockConnect.assert_called_once()
        mockGetName.assert_called_once()
        mockGet.assert_called_once_with(pipeline, 8, 'pipeline name')
        mockCreate.assert_called_once_with(8, 'some_file')
        mockAdd.assert_called_once_with(mockCreate.return_value)

    @patch('theia.api.models.JobBundle.name_subject_set',
           return_value='bundle name')
    @patch('theia.operations.panoptes_operations.UploadSubject._connect')
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._get_subject_set',
        return_value=SubjectSet())
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._create_subject',
        return_value=Subject())
    @patch('panoptes_client.SubjectSet.add')
    def test_apply_multiple(self, mockAdd, mockCreate, mockGet, mockConnect,
                            mockGetName, *args):
        """apply() with multiple_subject_sets=True resolves the set name
        via JobBundle.name_subject_set instead of the pipeline.
        """
        project = Project(id=8)
        pipeline = Pipeline(project=project, multiple_subject_sets=True)
        bundle = JobBundle(pipeline=pipeline)

        operation = UploadSubject(bundle)
        operation.apply(['some_file'])

        mockConnect.assert_called_once()
        mockGetName.assert_called_once()
        mockGet.assert_called_once_with(bundle, 8, 'bundle name')
        mockCreate.assert_called_once_with(8, 'some_file')
        mockAdd.assert_called_once_with(mockCreate.return_value)

    @patch('panoptes_client.Panoptes.connect')
    @patch('theia.utils.PanoptesUtils.base_url', return_value='sample url')
    @patch('theia.utils.PanoptesUtils.client_id', return_value='sample id')
    @patch('theia.utils.PanoptesUtils.client_secret',
           return_value='sample secret')
    def test__connect(self, mockSecret, mockId, mockUrl, mockConnect):
        """_connect() forwards PanoptesUtils credentials to
        Panoptes.connect.
        """
        operation = UploadSubject(JobBundle())
        operation._connect()
        mockUrl.assert_called_once()
        mockId.assert_called_once()
        mockSecret.assert_called_once()
        mockConnect.assert_called_once_with(endpoint='sample url',
                                            client_id='sample id',
                                            client_secret='sample secret')

    @patch('theia.api.models.JobBundle.save')
    @patch('theia.api.models.Pipeline.save')
    @patch(
        'theia.operations.panoptes_operations.UploadSubject._create_subject_set'
    )
    @patch('panoptes_client.SubjectSet.find', autospec=True)
    def test__get_subject_set(self, mockFind, mockCreateSet, *args):
        """_get_subject_set() finds the set by id when the scope object
        already has a subject_set_id, and creates one otherwise —
        for both JobBundle and Pipeline scopes.
        """
        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        emptyJobBundle = JobBundle()
        linkedJobBundle = JobBundle(subject_set_id=3)
        emptyPipeline = Pipeline()
        linkedPipeline = Pipeline(subject_set_id=3)

        # No linked set on the bundle: a new set must be created.
        operation = UploadSubject(emptyJobBundle)
        result = operation._get_subject_set(emptyJobBundle, 8, 'some name')
        mockFind.assert_not_called()
        mockCreateSet.assert_called_once_with(8, 'some name')

        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        # Bundle already linked to set 3: it must be looked up, not created.
        operation = UploadSubject(linkedJobBundle)
        result = operation._get_subject_set(linkedJobBundle, 8, 'some name')
        mockFind.assert_called_once_with(3)
        mockCreateSet.assert_not_called()

        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        # Same pair of cases with a Pipeline as the scope object.
        operation = UploadSubject(emptyPipeline)
        result = operation._get_subject_set(emptyPipeline, 8, 'some name')
        mockFind.assert_not_called()
        mockCreateSet.assert_called_once_with(8, 'some name')

        mockFind.reset_mock()
        mockCreateSet.reset_mock()

        operation = UploadSubject(linkedPipeline)
        result = operation._get_subject_set(linkedPipeline, 8, 'some name')
        mockFind.assert_called_once_with(3)
        mockCreateSet.assert_not_called()

    @patch('panoptes_client.Project.find', return_value=Mock())
    @patch('panoptes_client.Subject.save', autospec=True)
    @patch('panoptes_client.Subject.add_location', autospec=True)
    def test__create_subject_no_metadata(self, mockAdd, mockSave, mockFind):
        """_create_subject() without metadata looks up the project, adds
        the file location, and saves the subject.
        """
        operation = UploadSubject(None)
        created_subject = operation._create_subject(1, 'some_file')
        mockFind.assert_called_once_with(1)
        mockAdd.assert_called_once_with(created_subject, 'some_file')  # weird
        mockSave.assert_called_once()
        # NOTE(review): compares the Project mock itself to
        # links.project.id — looks wrong, but matches current behavior.
        assert (mockFind.return_value == created_subject.links.project.id
                )  # weird

    @patch('panoptes_client.Project.find', return_value=Mock())
    @patch('panoptes_client.Subject.save', autospec=True)
    @patch('panoptes_client.Subject.add_location', autospec=True)
    def test__create_subject_with_metadata(self, mockAdd, mockSave, mockFind):
        """_create_subject() with a metadata dict stores it verbatim on
        the created subject.
        """
        operation = UploadSubject(None)
        created_subject = operation._create_subject(1, 'some_file',
                                                    {'foo': 'bar'})
        mockFind.assert_called_once_with(1)
        mockAdd.assert_called_once_with(created_subject, 'some_file')  # weird
        mockSave.assert_called_once()
        # NOTE(review): same mock-vs-id comparison as above.
        assert (mockFind.return_value == created_subject.links.project.id
                )  # weird
        assert (created_subject.metadata == {'foo': 'bar'})

    @patch('panoptes_client.Project.find', return_value=Mock())
    @patch('panoptes_client.SubjectSet.save', autospec=True)
    def test__create_subject_set(self, mockSave, mockFind):
        """_create_subject_set() builds a set with the given display
        name in the given project and saves it.
        """
        operation = UploadSubject(None)
        created_set = operation._create_subject_set(1, 'some name')

        mockFind.assert_called_once_with(1)
        mockSave.assert_called_once()
        assert (created_set.display_name == 'some name')
        # NOTE(review): same mock-vs-id comparison as above.
        assert (mockFind.return_value == created_set.links.project.id)  # weird
예제 #24
0
def main():
    """Read image names from a file and upload them to Zooniverse in
    subject sets of at most ``n`` images each, marking each image's
    MongoDB record as sent for crowdsourcing.

    Raises:
        ValueError: if ``n`` is outside [1, 10000].
    """
    ap = argparse.ArgumentParser(
        description=
        'Given a list of images, bins them into subject sets of size n')

    # require file path to read in images
    ap.add_argument('-f',
                    '--filename',
                    required=True,
                    dest='filename',
                    type=str,
                    help='The name of the file from which to read the images')

    # optionally require subject set size; defaults to 1000
    ap.add_argument(
        '-n',
        '--size',
        required=False,
        dest='n',
        type=int,
        default=1000,
        help='The maximum number of images a subject set should contain. \
                          The value should be between 1 and 10000, inclusive')

    # parse args into variables and check values
    args = vars(ap.parse_args())

    filename = args['filename']
    n = args['n']

    # Chained comparison; also correctly rejects n == 0 (the previous
    # ``args['n'] if args['n'] else None`` turned 0 into None and then
    # crashed with a TypeError here instead of this clear error).
    if not 1 <= n <= 10000:
        raise ValueError('n must be between 1 and 10000, inclusive')

    # connect to zooniverse
    Panoptes.connect(username=zooniverse_config.Zooniverse_USERNAME,
                     password=zooniverse_config.Zooniverse_PASS)
    project = Project.find(zooniverse_config.Project_ID)

    # connection to mongodb
    mongoConn = MongoClient(csh_db_config.DB_HOST + ":" +
                            str(csh_db_config.DB_PORT))
    cshTransDB = mongoConn[csh_db_config.TRANSCRIPTION_DB_NAME]
    cshTransDB.authenticate(csh_db_config.TRANSCRIPTION_DB_USER,
                            csh_db_config.TRANSCRIPTION_DB_PASS)
    cshCollection = cshTransDB[csh_db_config.TRANS_DB_MeetingMinColl]

    # track subject sets being created
    subjectSets = []

    # get the image filenames in a Python list
    with open(filename) as handle:
        filenames = handle.readlines()

    # Divide files into groups of at most n, dropping the None padding
    # that zip_longest appends to the final group.
    filegroups = list([e for e in t if e is not None]
                      for t in itertools.zip_longest(*([iter(filenames)] * n)))

    for group in filegroups:
        displayName = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())

        # create a new subject set named after the current timestamp
        subjectSet = SubjectSet()
        subjectSet.links.project = project
        subjectSet.display_name = displayName
        subjectSet.save()

        subjectSetId = subjectSet.id
        subjectSets.append(subjectSetId)

        # Create a new subject for each file and add it to the subject
        # set.  The loop variable is named image_name so that it no
        # longer shadows the input-file ``filename`` above.
        for image_name in group:
            # remove trailing '\n' character
            image_name = image_name.rstrip()

            # create a new subject
            subject = Subject()
            subject.links.project = project

            filepath = cshCollection.find_one(
                {'_id': image_name})['file']['anonPath']
            subject.add_location(filepath)
            subject.metadata['ID'] = image_name
            subject.save()

            # add to subject set
            subjectSet.add(subject)

            # mark the mongodb record as sent for crowdsourcing
            updateQuery = {
                '$set': {
                    'canCrowdsource': True,
                    'transcription': {
                        'numClassifications': 5,
                        'subjectSetId': subjectSetId,
                        'status': 'sent'
                    }
                }
            }
            # Return value is unused, so don't bind it to a local.
            cshCollection.find_one_and_update({'_id': image_name},
                                              updateQuery)

    # add subject sets to the workflow
    workflow = project.links.workflows[0]
    workflow.add_subject_sets(subjectSets)

    # print helpful information to the console
    print('{} subject sets created with the following IDs: {}'.format(
        len(subjectSets), subjectSets))