def reindex_specific_data_type(data_type): FileProcessLock.lock() print "starting..." #this line will raise an error if something is wrong with the data type file_name_key = data_stream_to_s3_file_name_string(data_type) relevant_chunks = ChunksRegistry(data_type=data_type) relevant_indexed_files = [ chunk["chunk_path"] for chunk in relevant_chunks ] print "purging old data..." for chunk in relevant_chunks: chunk.remove() pool = ThreadPool(20) pool.map(s3_delete, relevant_indexed_files) print "pulling files to process..." files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] ) for i,l in enumerate(files_lists): print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files" for fp in l: if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS: FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1]) del files_lists, l pool.close() pool.terminate() print str(datetime.now()), "processing data..." FileProcessLock.unlock() process_file_chunks() print "Done."
def reindex_all_files_to_process(): """ Totally removes the FilesToProcess DB, deletes all chunked files on s3, clears the chunksregistry, and then adds all relevent files on s3 to the files to process registry. """ FileProcessLock.lock() print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count() FileToProcess.db().drop() print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count() ChunkRegistry.db().drop() pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2 ) print str(datetime.now()), "deleting older chunked data:", CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER) print len(CHUNKED_DATA) pool.map(s3_delete, CHUNKED_DATA) del CHUNKED_DATA print str(datetime.now()), "pulling new files to process..." files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] ) print "putting new files to process..." for i,l in enumerate(files_lists): print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files" for fp in l: if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS: FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1]) del files_lists, l pool.close() pool.terminate() print str(datetime.now()), "processing data." FileProcessLock.unlock() process_file_chunks()
def edit_admin(admin_id):
    """ Renders the admin-editing page for the admin identified by admin_id,
    including the studies that admin belongs to (sorted by name) and whether
    the admin being edited is the currently logged-in user. """
    admin = Admin(admin_id)
    editing_self = admin._id == session['admin_username']
    studies_of_admin = sorted(
            Studies(admins=admin._id),
            key=lambda study: study.name.lower()
    )
    return render_template(
            'edit_admin.html',
            admin=admin,
            current_studies=studies_of_admin,
            all_studies=Studies.get_all_studies(),
            allowed_studies=get_admins_allowed_studies(),
            admin_is_current_user=editing_self,
            system_admin=admin_is_system_admin()
    )
def render_edit_survey(survey_id=None):
    """ Renders the survey editor page for the given survey.

    Fixes over the original: the ``if not survey`` guard now runs *before*
    ``survey['_id']`` is dereferenced (the original crashed instead of
    404ing on a bad id), and a survey not attached to any study now 404s
    instead of raising IndexError on ``[...][0]``. """
    survey = Survey(survey_id)
    if not survey:
        return abort(404)
    # find the (single) study that owns this survey
    study = next(
        (study for study in Studies() if survey['_id'] in study['surveys']),
        None
    )
    if study is None:
        # orphaned survey: no study references it
        return abort(404)
    return render_template(
        'edit_survey.html',
        survey=survey,
        study=study,
        allowed_studies=get_admins_allowed_studies(),
        system_admin=admin_is_system_admin()
    )
def get_studies():
    """ API endpoint: validates the caller's access credentials and returns a
    JSON object mapping study id (string) to study name for every study the
    authenticated admin is attached to. Responds 403 on a missing access key
    or an incorrect secret. """
    access_key = request.values["access_key"]
    access_secret = request.values["secret_key"]
    admin = Admin(access_key_id=access_key)
    if not admin:
        return abort(403)  # access key DNE
    if not admin.validate_access_credentials(access_secret):
        return abort(403)  # incorrect secret key
    studies_by_id = {}
    for study in Studies(admins=str(admin._id)):
        studies_by_id[str(study._id)] = study.name
    return json.dumps(studies_by_id)
def manage_admins():
    """ Renders the admin-management page: every admin paired with a
    pipe-separated, alphabetically sorted list of the studies they belong to,
    with the admin rows themselves sorted by admin name. """
    listing = []
    for admin in Admins():
        study_names = Studies(admins=admin._id, field='name')
        joined_names = ' | '.join(sorted(study_names, key=lambda name: name.lower()))
        listing.append((admin._id, joined_names))
    listing.sort(key=lambda pair: pair[0].lower())
    return render_template(
            'manage_admins.html',
            admins=listing,
            allowed_studies=get_admins_allowed_studies(),
            system_admin=admin_is_system_admin()
    )
def migrate_studies():
    """ Migrates every Mongolia Study to a Django Study via bulk_create.

    Side effects on module globals (populated for later migration passes):
    - study_referents: study name -> the study's Mongolia surveys, admins,
      and device settings.
    - study_id_dict: Mongolia study _id -> {'pk': ..., 'deleted': ...} of the
      corresponding Django Study.

    NOTE(review): Django Studies are looked up by name in the second pass, so
    this presumably assumes study names are unique — confirm upstream.
    """
    d_study_list = []
    for m_study in MStudySet.iterator():
        with error_handler:
            # Create a Django Study object modeled off the Mongolia Study
            study_name = m_study['name']
            d_study = DStudy(
                name=study_name,
                encryption_key=m_study['encryption_key'],
                object_id=m_study['_id'],
                deleted=m_study['deleted'],
                is_test=m_study['is_test'],
            )

            # Validate the new Study object and add it to the bulk create list
            d_study.full_clean()
            d_study_list.append(d_study)

            # Get lists of Mongolia Surveys, Admins and StudyDeviceSettings
            # attached to this Study
            m_survey_list = m_study['surveys']
            m_admin_list = m_study['admins']
            m_device_settings = m_study['device_settings']
            study_referents[study_name] = {
                'survey_list': m_survey_list,
                'admin_list': m_admin_list,
                'device_settings': m_device_settings,
            }

    # Bulk create the Django Studies
    DStudy.objects.bulk_create(d_study_list)

    # Create a reference from Mongolia Study IDs to Django Studies that
    # doesn't require any future database calls.
    for m_study in MStudySet.iterator():
        with error_handler:
            m_study_id = m_study['_id']
            d_study_id = DStudy.objects.filter(name=m_study['name']).values('pk', 'deleted').get()
            study_id_dict[m_study_id] = d_study_id
def create_study():
    """ GET: renders the study-creation form. POST: creates a default study
    from the submitted name and encryption key, then redirects to its device
    settings page; on a bad key or duplicate name, flashes the error and
    redirects back to the form. """
    if request.method == 'GET':
        return render_template(
                'create_study.html',
                studies=Studies.get_all_studies(),
                allowed_studies=get_admins_allowed_studies(),
                system_admin=admin_is_system_admin()
        )
    name = request.form.get('name')
    encryption_key = request.form.get('encryption_key')
    try:
        new_study = Study.create_default_study(name, encryption_key)
        flash("Successfully created a new study.", 'success')
        copy_existing_study_if_asked_to(new_study)
        return redirect('/device_settings/' + str(new_study._id))
    except (InvalidEncryptionKeyError, StudyAlreadyExistsError) as e:
        flash(e.message, 'danger')
        return redirect('/create_study')
def authenticate_and_call(*args, **kwargs):
    # Decorator wrapper: closes over `some_function` from the enclosing
    # decorator (defined outside this view of the file). Enforces that the
    # caller is logged in AND is a system admin, normalizes any study_id
    # kwarg to an ObjectId, and verifies the study exists before delegating.
    if not is_logged_in():  # check for regular login requirement
        return redirect("/")
    admin = Admin(session['admin_username'])
    if not admin["system_admin"]:
        # TODO: Low Priority. Josh. redirect to a URL, not a template file
        return abort(403)
    if 'study_id' in kwargs:
        study_id = kwargs['study_id']
        if not isinstance(study_id, ObjectId):
            # make an extra check in case authenticate_admin_study_access
            # has already converted the id.
            study_id = ObjectId(study_id)
            kwargs['study_id'] = study_id
        # redirect home if no study matches the id
        if not Studies(_id=study_id):
            return redirect("/")
    return some_function(*args, **kwargs)
def get_all_timings_files():
    """ Lists, via a thread pool, every surveyTimings file path on S3 across
    all users of all studies, and returns them utf-8 decoded with
    pre-multistudy paths (wrong segment count) filtered out.

    Fixes over the original: an empty prefix list no longer crashes
    (``ThreadPool(0)`` raises ValueError), the pool size is capped instead of
    spawning one thread per user prefix, and the pointless
    ``except Exception: raise`` is gone (the ``finally`` already guarantees
    pool cleanup). """
    # get users associated with studies
    study_users = {
        str(s._id): Users(study_id=s._id, field='_id') for s in Studies()
    }
    all_user_timings = []
    for sid, users in study_users.items():
        # construct prefixes: <study_id>/<user_id>/surveyTimings
        all_user_timings.extend(sid + "/" + u + "/" + "surveyTimings" for u in users)

    if not all_user_timings:
        return []

    # use a threadpool to efficiently get all those strings of s3 paths we
    # will need; cap it so thousands of users don't mean thousands of threads
    pool = ThreadPool(min(len(all_user_timings), 50))
    try:
        files_lists = pool.map(s3_list_files, all_user_timings)
    finally:
        pool.close()
        pool.terminate()

    files_list = []
    for file_list in files_lists:
        files_list.extend(file_list)

    # we need to purge the occasional pre-multistudy file, and ensure it is
    # utf encoded.
    return [f.decode("utf8") for f in files_list if f.count('/') == 4]
def manage_studies():
    """ Renders the study-management page listing every study, the studies
    the current admin may access, and whether they are a system admin. """
    context = {
        'studies': Studies.get_all_studies(),
        'allowed_studies': get_admins_allowed_studies(),
        'system_admin': admin_is_system_admin(),
    }
    return render_template('manage_studies.html', **context)
def get_creation_arguments(cls, params, file_object):
    """ Validates pipeline-upload parameters and returns the kwargs needed to
    create the upload record (creation time, s3 path, study id, tags, file
    name, file hash).

    Raises InvalidUploadParameterError listing every problem found — once
    after the required-parameter check, and once after all remaining checks.

    Fixes over the original: removed two leftover debug ``print`` statements,
    and removed the ``'...' % x if x else None`` conditional that could append
    ``None`` to the error list (the requireds check above already guarantees
    study_id is truthy). """
    errors = []
    # ensure required are present, we don't allow falsey contents.
    for key in PipelineUpload.REQUIREDS:
        if not params.get(key, None):
            errors.append('missing required parameter: "%s"' % key)
    # if we escape here early we can simplify the code that requires all
    # parameters later
    if errors:
        raise InvalidUploadParameterError("\n".join(errors))

    # validate study_id
    study_id_object_id = ObjectId(params["study_id"])
    if not Studies(_id=study_id_object_id):
        errors.append('encountered invalid study_id: "%s"' % params["study_id"])

    if len(params['file_name']) > 256:
        errors.append(
            "encountered invalid file_name, file_names cannot be more than 256 characters"
        )
    if PipelineUploads.count(file_name=params['file_name']):
        errors.append('a file with the name "%s" already exists' % params['file_name'])

    try:
        tags = json.loads(params["tags"])
        if not isinstance(tags, list):
            # must be json list, can't be json dict, number, or string.
            raise ValueError()
        if not tags:
            errors.append("you must provide at least one tag for your file.")
        tags = [str(_) for _ in tags]
    except ValueError:
        errors.append(
            "could not parse tags, ensure that your uploaded list of tags is a json compatible array."
        )

    if errors:
        raise InvalidUploadParameterError("\n".join(errors))

    creation_time = datetime.utcnow()
    file_hash = low_memory_chunk_hash(file_object.read())
    file_object.seek(0)  # rewind so the caller can still upload the content
    # random 32-char suffix guarantees path uniqueness on s3
    s3_path = "%s/%s/%s/%s/%s" % (
        PIPELINE_FOLDER,
        params["study_id"],
        params["file_name"],
        creation_time.isoformat(),
        ''.join(
            random.choice(string.ascii_letters + string.digits) for i in range(32)),
        # todo: file_name?
    )
    return {
        "creation_time": creation_time,
        "s3_path": s3_path,
        "study_id": study_id_object_id,
        "tags": tags,
        "file_name": params["file_name"],
        "file_hash": file_hash,
    }
def get_admins_allowed_studies():
    """ Return a list of studies which the currently logged-in admin is
    authorized to view and edit, sorted by study name. """
    current_admin = Admin(session['admin_username'])
    allowed = Studies(admins=current_admin._id)
    return sorted(allowed, key=lambda study: study.name.lower())
# NOTE(review): the two statements below arrived without indentation context
# and look like the tail of a preceding migration-driver function (likely
# run_all_migrations) — confirm their placement against version control.
print "migrate_upload_trackers..."
migrate_upload_trackers()


if __name__ == '__main__':
    # Cross-migration lookup state, populated by the individual migration
    # steps and read by later ones.
    study_referents = {}
    study_id_dict = {}
    user_id_dict = {}
    survey_id_dict = {}
    orphaned_surveys = {}

    d_study_admin_list = []  # A list of study-researcher pairs
    d_study_survey_dict = {}  # A mapping of surveys to their associated studies
    d_study_settings_dict = {}  # A mapping of device settings to their associated studies

    CHUNK_SIZE = 10000  # batch size used by the bulk migrations

    # error_handler = ErrorHandler()
    error_handler = null_error_handler()

    # Pre-migration record counts on the Mongolia side, for eyeballing totals.
    print(MStudySet.count(), MSurveySet.count(), MSettingsSet.count(),
          MAdminSet.count(), MUserSet.count(), MChunkSet.count(), MUploadSet.count())

    with error_handler:
        run_all_migrations()

    # Post-migration record counts on the Django side; should mirror the above.
    print(DStudy.objects.count(), DSurvey.objects.count(), DSettings.objects.count(),
          DAdmin.objects.count(), DUser.objects.count(), DChunk.objects.count(),
          DUpload.objects.count())

    print("end:", datetime.now())
    # Surface any errors the handler collected during the run.
    error_handler.raise_errors()