# Collect the `src` of every <img> tag whose src matches "gstatic.com".
images = [a['src'] for a in soup.find_all("img", {"src": re.compile("gstatic.com")})]

# Directory the downloaded images are written to; change it here if needed.
DIR = "images/"

for img in images:
    raw_img = urllib2.urlopen(img).read()
    # Number the new file after the files already in DIR whose name contains
    # `image_type`, so the first download becomes <image_type>_1.jpg.
    cntr = len([i for i in os.listdir(DIR) if image_type in i]) + 1
    # `with` closes the file even if the write fails part-way through.
    with open(DIR + image_type + "_" + str(cntr) + ".jpg", 'wb') as f:
        f.write(raw_img)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Creating image set...')

# create the subject set.
subject_set = SubjectSet()
subject_set.links.project = p
subject_set.display_name = "Images of " + thing + "'s"
subject_set.save()

print('Uploading images to Zooniverse...')

# add all images to subject set
# NOTE(review): this assumes files <thing>_1.jpg .. <thing>_20.jpg exist in
# DIR and that `thing` equals the `image_type` used when saving above - confirm.
for i in range(1, 21):
    subject = Subject()
    subject.links.project = p
    subject.add_location(DIR + str(thing) + '_' + str(i) + '.jpg')
    subject.save()
    subject_set.add(subject)

print('Complete.')
def upload_manifest_to_galaxy_zoo(
        subject_set_name,
        manifest,
        project_id='5733',  # default to main GZ project
        login_loc='zooniverse_login.txt'):
    """
    Save manifest (set of galaxies with metadata prepared) to Galaxy Zoo

    Subjects are linked to the subject set of the project whose display name
    equals ``subject_set_name``; if the project has no such set, a new one is
    created. Entries are saved in blocks of 100 so that each block's
    asynchronous saves are joined before the next block starts.

    Args:
        subject_set_name (str): name for subject set. If it contains 'TEST',
            nothing is uploaded and the manifest is returned immediately.
        manifest (list): containing dicts of form {png_loc: img.png, key_data: {metadata_col: metadata_value}}
        project_id (str): panoptes project id e.g. '5733' for Galaxy Zoo, '6490' for mobile
        login_loc (str): path of the login file; its parsed contents are
            passed as keyword arguments to ``Panoptes.connect``

    Returns:
        list: the ``manifest`` argument itself (for debugging only)

    Raises:
        AssertionError: if ``login_loc`` does not exist
    """
    assert os.path.exists(login_loc), \
        'Login file not found: {}'.format(login_loc)
    if 'TEST' in subject_set_name:
        logging.warning('Testing mode detected - not uploading!')
        return manifest

    # Human-readable labels for the project ids this function knows about.
    known_projects = {
        '5733': 'Galaxy Zoo',
        '6490': 'mobile app',
        '8751': 'staging',
    }
    logging.info('Uploading to {} project {}'.format(
        known_projects.get(project_id, 'unknown'), project_id))

    # Important - don't commit the password!
    zooniverse_login = read_data_from_txt(login_loc)
    Panoptes.connect(**zooniverse_login)

    project = Project.find(project_id)

    # check if subject set already exists
    subject_set = None
    for candidate_subject_set in SubjectSet.where(project_id=project_id):
        if candidate_subject_set.raw['display_name'] == subject_set_name:
            # use if it already exists (the last match wins)
            subject_set = candidate_subject_set
    if not subject_set:  # make a new one if not
        subject_set = SubjectSet()
        subject_set.links.project = project
        subject_set.display_name = subject_set_name
        subject_set.save()

    pbar = tqdm(total=len(manifest), unit=' subjects uploaded')
    save_subject_partial = functools.partial(
        save_subject, project=project, pbar=pbar)

    # upload in async blocks, to avoid huge join at end
    manifest_block_size = 100
    try:
        # range() yields no empty trailing block, so an empty manifest or one
        # whose length is a multiple of the block size never triggers a
        # pointless subject_set.add([]) call.
        for block_start in range(0, len(manifest), manifest_block_size):
            manifest_block = manifest[block_start:
                                      block_start + manifest_block_size]

            # Leaving the context joins the saves queued inside it.
            with Subject.async_saves():
                new_subjects = [save_subject_partial(manifest_entry)
                                for manifest_entry in manifest_block]

            subject_set.add(new_subjects)
            # Log the count, not the list of subject objects.
            logging.info('{} subjects linked'.format(len(new_subjects)))
    finally:
        pbar.close()

    return manifest  # for debugging only
    subject_set = SubjectSet()
    subject_set.links.project = project
    subject_set.display_name = set_name
    subject_set.save()

# Single-argument print(...) calls behave the same on Python 2 and Python 3;
# the formatted strings below reproduce the text the print statements emitted.
print('Uploading subjects, this could take a while!')
new_subjects = 0
for filename, metadata in subject_metadata.items():
    try:
        # Only files not already listed in previous_subjects are uploaded.
        if filename not in previous_subjects:
            subject = Subject()
            subject.links.project = project
            subject.add_location(location + os.sep + filename)
            subject.metadata.update(metadata)
            subject.save()
            print(filename)
            subject_set.add(subject.id)
            new_subjects += 1
    except panoptes_client.panoptes.PanoptesAPIException:
        # Best effort: report the failed file and carry on with the rest.
        print('An error occurred during the upload of  {}'.format(filename))
print('{} new subjects created and uploaded'.format(new_subjects))

# Write one "<subject id>,<first metadata value>" line per subject that is
# now linked to the subject set.
uploaded = 0
with open(location + os.sep + 'Uploaded subjects.csv', 'wt') as file_up:
    # next(...) works on both Python 2 and 3 iterators, unlike .next().
    subject_set = next(SubjectSet.where(project_id=project.id,
                                        display_name=set_name))
    for subject in subject_set.subjects:
        uploaded += 1
        # dict.values() is not indexable on Python 3; take the first value
        # through an iterator and fall back to '' when metadata is empty.
        # NOTE(review): which value is "first" depends on dict ordering, and
        # values containing commas are not escaped - confirm this is acceptable.
        first_value = next(iter(subject.metadata.values()), '')
        file_up.write('{},{}\n'.format(subject.id, first_value))
    print('{}  subjects found in the subject set, see the full list in Uploaded subjects.csv.'.format(uploaded))
Exemplo n.º 4
0
def main():
    """Bin a list of image ids into Zooniverse subject sets of at most n images.

    Command-line arguments:
        -f/--filename: file with one image id per line (blank lines are
            ignored, trailing whitespace is stripped).
        -n/--size: maximum number of images per subject set (default 1000).

    For every id the matching MongoDB record (looked up by ``_id``) supplies
    the image path (``record['file']['anonPath']``), a subject is created for
    it and added to the current subject set, and the record is updated with
    the transcription status. Finally all created subject sets are added to
    the first workflow linked to the project and their ids are printed.

    Raises:
        ValueError: if n is not between 1 and 10000, inclusive.
        LookupError: if an id from the file has no record in the collection.
    """
    ap = argparse.ArgumentParser(
        description=
        'Given a list of images, bins them into subject sets of size n')

    # require file path to read in images
    ap.add_argument('-f',
                    '--filename',
                    required=True,
                    dest='filename',
                    type=str,
                    help='The name of the file from which to read the images')

    # optionally require subject set size; defaults to 1000
    ap.add_argument(
        '-n',
        '--size',
        required=False,
        dest='n',
        type=int,
        default=1000,
        help='The maximum number of images a subject set should contain. '
             'The value should be between 1 and 10000, inclusive')

    # parse args and check values
    args = ap.parse_args()
    n = args.n

    # Validate n as parsed: mapping a falsy 0 to None would make the
    # comparison raise TypeError instead of the intended ValueError.
    if not 1 <= n <= 10000:
        raise ValueError('n must be between 1 and 10000, inclusive')

    # connect to zooniverse
    Panoptes.connect(username=zooniverse_config.Zooniverse_USERNAME,
                     password=zooniverse_config.Zooniverse_PASS)
    project = Project.find(zooniverse_config.Project_ID)

    # connection to mongodb
    mongo_conn = MongoClient(csh_db_config.DB_HOST + ":" +
                             str(csh_db_config.DB_PORT))
    trans_db = mongo_conn[csh_db_config.TRANSCRIPTION_DB_NAME]
    trans_db.authenticate(csh_db_config.TRANSCRIPTION_DB_USER,
                          csh_db_config.TRANSCRIPTION_DB_PASS)
    collection = trans_db[csh_db_config.TRANS_DB_MeetingMinColl]

    # get the image ids in a Python list, dropping trailing newlines/whitespace
    # and skipping blank lines (a blank id can never match a record)
    with open(args.filename) as handle:
        stripped_lines = (line.rstrip() for line in handle)
        image_ids = [image_id for image_id in stripped_lines if image_id]

    # divide ids into consecutive groups of at most n
    id_groups = [image_ids[start:start + n]
                 for start in range(0, len(image_ids), n)]

    # track subject sets being created
    subject_set_ids = []

    for group in id_groups:
        # the subject set is named after its creation time
        display_name = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())

        # create a new subject set
        subject_set = SubjectSet()
        subject_set.links.project = project
        subject_set.display_name = display_name
        subject_set.save()

        subject_set_id = subject_set.id
        subject_set_ids.append(subject_set_id)

        # the same update applies to every record in this group
        update_query = {
            '$set': {
                'canCrowdsource': True,
                'transcription': {
                    'numClassifications': 5,
                    'subjectSetId': subject_set_id,
                    'status': 'sent'
                }
            }
        }

        # create a new subject for each id and add it to the subject set
        for image_id in group:
            record = collection.find_one({'_id': image_id})
            if record is None:
                # fail with the offending id rather than an opaque
                # "'NoneType' object is not subscriptable"
                raise LookupError(
                    'No record found for image id {!r}'.format(image_id))

            # create a new subject
            subject = Subject()
            subject.links.project = project
            subject.add_location(record['file']['anonPath'])
            subject.metadata['ID'] = image_id
            subject.save()

            # add to subject set
            subject_set.add(subject)

            # mark the record in mongodb as sent for transcription
            collection.find_one_and_update({'_id': image_id}, update_query)

    # add subject sets to the workflow
    workflow = project.links.workflows[0]
    workflow.add_subject_sets(subject_set_ids)

    # print helpful information to the console
    print('{} subject sets created with the following IDs: {}'.format(
        len(subject_set_ids), subject_set_ids))