def re_add_files_to_process(cls, number=100):
    """ Re-adds the most recent [number] files that have been uploaded recently to
    FileToProcess.  (this is fairly optimized because it is part of debugging file
    processing)

    Skips any file whose path already appears in FileToProcess. Prints a progress
    counter every 10 rows. """
    uploads = cls.objects.order_by("-created_on").values_list(
        "file_path", "participant__study__object_id", "participant_id")[:number]
    from database.data_access_models import FileToProcess

    # cache participants to avoid one query per upload row
    participant_cache = {}
    for i, (file_path, study_object_id, participant_id) in enumerate(uploads):
        participant = participant_cache.get(participant_id)
        if participant is None:
            participant = Participant.objects.get(id=participant_id)
            participant_cache[participant_id] = participant

        if i % 10 == 0:
            # BUGFIX: original used sep="... " which is a no-op with a single
            # argument; end= produces the intended "0... 10... 20..." progress line.
            print(i, end="... ", flush=True)

        if FileToProcess.objects.filter(s3_file_path__icontains=file_path).exists():
            print(f"skipping {file_path}, appears to already be present")
            continue

        FileToProcess.append_file_for_processing(
            # file_path, study_object_id, **kwargs
            file_path,
            study_object_id,
            participant=participant,
        )
def fix_duplicates(duplicate_chunks):
    """ For each chunk path, collapse duplicate chunks down to a single chunk.

    Chunkable data streams are re-queued for reprocessing from their original
    uploads before the duplicates are removed; non-chunkable streams just have
    their duplicates removed directly. """
    for path in duplicate_chunks:
        # deconstruct relevant information from chunk path, clean it
        # (reuse the split instead of re-splitting the path in each branch)
        path_components = path.split("/")
        if len(path_components) == 5:
            _, study_obj_id, username, data_stream, timestamp = path_components
        elif len(path_components) == 4:
            study_obj_id, username, data_stream, timestamp = path_components
        else:
            print(
                "You appear to have an invalid file path. Please report this error to https://github.com/onnela-lab/beiwe-backend/issues"
            )
            raise Exception("invalid_path: %s" % path)

        # not all files are chunkable, they will require different logic.
        if data_stream not in CHUNKABLE_FILES:
            remove_all_but_one_chunk(path)
            continue

        try:
            FileToProcess.reprocess_originals_from_chunk_path(path)
        except Exception as e:
            # "no matching files" is expected and safe to ignore; anything else
            # is a real error and must propagate.
            if "did not find any matching files" not in str(e):
                raise
        remove_all_but_one_chunk(path)
def reindex_all_files_to_process():
    """ Totally clears the FilesToProcess DB, deletes all chunked files on S3, clears the
    ChunksRegistry DB, reads all relevant files on S3 to the FilesToProcess registry and
    then re-chunks them.

    NOTE(review): this function raises unconditionally on entry — everything after the
    raise is unreachable legacy code retained for reference only. """
    raise Exception(
        "This code has not been tested since converting database backends, that means 2018"
    )
    # Delete all preexisting FTP and ChunkRegistry objects
    FileProcessLock.lock()
    print('{!s} purging FileToProcess: {:d}'.format(
        datetime.now(), FileToProcess.objects.count()))
    FileToProcess.objects.all().delete()
    print('{!s} purging ChunkRegistry: {:d}'.format(
        datetime.now(), ChunkRegistry.objects.count()))
    ChunkRegistry.objects.all().delete()
    # thread pool for parallel S3 network operations
    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    # Delete all preexisting chunked data files
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print('{!s} deleting older chunked data: {:d}'.format(
        datetime.now(), len(CHUNKED_DATA)))
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    # Get a list of all S3 files to replace in the database
    print('{!s} pulling new files to process...'.format(datetime.now()))
    files_lists = pool.map(s3_list_files,
                           Study.objects.values_list('object_id', flat=True))

    # For each such file, create an FTP object
    print("putting new files to process...")
    for i, l in enumerate(files_lists):
        print('{!s} {:d} of {:d}, {:d} files'.format(datetime.now(), i + 1,
                                                     Study.objects.count(), len(l)))
        for fp in l:
            # only files with processable extensions get FTP entries
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                # path layout assumed: study_object_id/patient_id/... — TODO confirm
                patient_id = fp.split('/', 2)[1]
                participant_pk = Participant.objects.filter(
                    patient_id=patient_id).values_list('pk', flat=True).get()
                FileToProcess.append_file_for_processing(
                    fp, fp.split("/", 1)[0], participant_id=participant_pk)

    # Clean up by deleting large variables, closing the thread pool and unlocking the file process lock
    del files_lists, l
    pool.close()
    pool.terminate()
    FileProcessLock.unlock()

    # Rechunk the newly created FTPs
    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
def create_fake_mp4(number=10):
    """ Debugging helper: uploads [number] fake voiceRecording mp4 files to S3 for the
    hard-coded test participant 'h6fflp' and queues each for processing. """
    participant = Participant.objects.get(patient_id='h6fflp')
    # BUGFIX: read the stand-in file once, in binary mode — mp4 data is not text,
    # and re-opening/re-reading it on every iteration was wasted work.
    with open("thing", "rb") as f:
        file_contents = f.read()
    for x in range(number):
        file_path = "55d3826297013e3a1c9b8c3e/h6fflp/voiceRecording/%s.mp4" % (
            1000000000 + x)
        s3_upload(file_path, file_contents, "55d3826297013e3a1c9b8c3e", raw_path=True)
        # BUGFIX: the original bound a Participant *instance* to a variable named
        # participant_id and passed it as the participant_id kwarg; pass it as
        # participant=, matching the other append_file_for_processing call sites.
        FileToProcess.append_file_for_processing(
            file_path, "55d3826297013e3a1c9b8c3e", participant=participant)
def batch_retrieve_for_processing(ftp_as_object: FileToProcess) -> dict:
    """ Used for mapping an s3_retrieve function.

    Downloads the S3 file contents for one FileToProcess.  Any exception raised
    during retrieval is captured into the returned dict (under 'exception' /
    'traceback') instead of propagating, so the parent can decide how to report it.
    """
    # Convert the ftp object to a dict so we can use __getattr__
    ftp = ftp_as_object.as_dict()
    data_type = file_path_to_data_type(ftp["s3_file_path"])

    # defaults used when retrieval fails
    file_contents = ""
    caught_exception = None
    exc_info = None

    # Try to retrieve the file contents. If any errors are raised, store them to be
    # raised by the parent function.
    try:
        print(ftp["s3_file_path"] + ", getting data...")
        file_contents = s3_retrieve(
            ftp["s3_file_path"], ftp["study"].object_id.encode(), raw_path=True
        )
    except Exception as e:
        traceback.print_exc()
        exc_info = sys.exc_info()
        caught_exception = e

    return {
        "ftp": ftp,
        "data_type": data_type,
        "exception": caught_exception,
        "file_contents": file_contents,
        "traceback": exc_info,
        "chunkable": data_type in CHUNKABLE_FILES,
    }
def reindex_specific_data_type(data_type):
    """ Purges and rebuilds the ChunkRegistry (and chunked S3 files) for a single data
    stream, then re-queues the original uploads for processing.

    NOTE(review): this function raises unconditionally on entry — everything after the
    raise is unreachable legacy code retained for reference only. """
    raise Exception(
        "This code has not been tested since converting database backends")
    FileProcessLock.lock()
    print("starting...")
    # Convert the data type; raise an error if something is wrong with it
    file_name_key = data_stream_to_s3_file_name_string(data_type)

    # Get all chunk paths of the given data type
    relevant_chunks = ChunkRegistry.objects.filter(data_type=data_type)
    # list() ensures that the QuerySet is evaluated before all of its elements are deleted (otherwise it would be empty)
    relevant_indexed_files = list(
        relevant_chunks.values_list('chunk_path', flat=True))

    # Delete the old ChunkRegistry objects
    print("purging old data...")
    relevant_chunks.delete()

    pool = ThreadPool(20)
    pool.map(s3_delete, relevant_indexed_files)

    print("pulling files to process...")
    files_lists = pool.map(s3_list_files,
                           Study.objects.values_list('object_id', flat=True))
    for i, l in enumerate(files_lists):
        print('{!s} {:d} of {:d}, {:d} files'.format(datetime.now(), i + 1,
                                                     Study.objects.count(), len(l)))
        for fp in l:
            # only files with processable extensions get FTP entries
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                # path layout assumed: study_object_id/patient_id/... — TODO confirm
                patient_id = fp.split('/', 2)[1]
                participant_pk = Participant.objects.filter(
                    patient_id=patient_id).values_list('pk', flat=True).get()
                FileToProcess.append_file_for_processing(
                    fp, fp.split("/", 1)[0], participant_id=participant_pk)

    del files_lists, l
    pool.close()
    pool.terminate()
    FileProcessLock.unlock()
    print("{!s} processing data.".format(datetime.now()))
    process_file_chunks()
    print("Done.")
def add_files_to_process2(cls, limit=25):
    """ Re-adds the most recent [limit] files that have been uploaded recently to
    FileToProcess, per data stream (identifiers excluded).  (this is fairly optimized
    because it is part of debugging file processing)

    Files whose s3 paths already exist in FileToProcess are skipped; new entries are
    inserted with a single bulk_create. """
    from database.data_access_models import FileToProcess

    # build one most-recent-first query per data stream, skipping identifiers
    upload_queries = []
    for ds in DATA_STREAM_TO_S3_FILE_NAME_STRING.values():
        if ds == "identifiers":
            continue
        query = (cls.objects.order_by("-created_on")
                 .filter(file_path__contains=ds)
                 .values_list("file_path", "participant__study_id",
                              "participant__study__object_id", "participant_id")[:limit])
        upload_queries.append((ds, query))

    new_ftps = []
    # seed the seen-set with every path already queued so we never enqueue duplicates
    file_paths_wandered = set(
        FileToProcess.objects.values_list("s3_file_path", flat=True))

    for file_type, uploads_query in upload_queries:
        print(file_type)
        for i, (file_path, study_id, object_id, participant_id) in enumerate(uploads_query):
            if i % 10 == 0 or i == limit - 1:
                # BUGFIX: original used sep="... " which is a no-op with a single
                # argument; end= produces the intended "0... 10... 20..." progress line.
                print(i + 1 if i == limit - 1 else i, end="... ", flush=True)

            if file_path in file_paths_wandered:
                continue
            file_paths_wandered.add(file_path)
            new_ftps.append(
                FileToProcess(s3_file_path=object_id + "/" + file_path,
                              study_id=study_id,
                              participant_id=participant_id))

    FileToProcess.objects.bulk_create(new_ftps)
def upload(OS_API=""):
    """ Entry point to upload GPS, Accelerometer, Audio, PowerState, Calls Log, Texts Log,
    Survey Response, and debugging files to s3.

    Behavior:
    The Beiwe app is supposed to delete the uploaded file if it receives an html 200 response.
    The API returns a 200 response when the file has A) been successfully handled, B) the file it
    has been sent is empty, C) the file did not decrypt properly.  We encountered problems in
    production with incorrectly encrypted files (as well as Android generating "rList" files
    under unknown circumstances) and the app then uploads them.  When the device receives a 200
    that is its signal to delete the file.
    When a file is undecryptable (this was tracked to a scenario where the device could not
    create/write an AES encryption key) we send a 200 response to stop that device attempting to
    re-upload the data.
    In the event of a single line being undecryptable (can happen due to io errors on the device)
    we drop only that line (and store the erroring line in an attempt to track it down.

    A 400 error means there is something is wrong with the uploaded file or its parameters,
    administrators will be emailed regarding this upload, the event will be logged to the apache
    log.  The app should not delete the file, it should try to upload it again at some point.

    If a 500 error occurs that means there is something wrong server side, administrators will be
    emailed and the event will be logged.  The app should not delete the file, it should try to
    upload it again at some point.

    Request format:
    send an http post request to [domain name]/upload, remember to include security
    parameters (see user_authentication for documentation).
    Provide the contents of the file, encrypted (see encryption specification) and properly
    converted to Base64 encoded text, as a request parameter entitled "file".
    Provide the file name in a request parameter entitled "file_name". """
    # Handle these corner cases first because they requires no database input.
    # Crash logs are from truly ancient versions of the android codebase
    # rList are randomly generated by android
    # PersistedInstallation files come from firebase.
    # todo: stop uploading junk files in the app by putting our files into a folder.
    file_name = request.values.get("file_name", None)
    if (
            not bool(file_name)
            or file_name.startswith("rList")
            or file_name.startswith("PersistedInstallation")
            or not contains_valid_extension(file_name)
    ):
        return render_template('blank.html'), 200

    # s3 paths use "/" where the client-side file name uses "_"
    s3_file_location = file_name.replace("_", "/")
    participant = get_session_participant()
    if participant.unregistered:
        # "Unregistered" participants are blocked from uploading further data.
        # If the participant is unregistered, throw away the data file, but
        # return a 200 "OK" status to the phone so the phone decides it can
        # safely delete the file.
        return render_template('blank.html'), 200

    # block duplicate FTPs. Testing the upload history is too complex
    if FileToProcess.test_file_path_exists(s3_file_location, participant.study.object_id):
        return render_template('blank.html'), 200

    uploaded_file = get_uploaded_file()
    try:
        uploaded_file = decrypt_device_file(uploaded_file, participant)
    except HandledError:
        # decryption code has already logged/handled this case; 200 tells the
        # device it may delete the file.
        return render_template('blank.html'), 200
    except DecryptionKeyInvalidError:
        # when the decryption key is invalid the file is lost. Nothing we can do.
        # record the event, send the device a 200 so it can clear out the file.
        if REPORT_DECRYPTION_KEY_ERRORS:
            tags = {
                "participant": participant.patient_id,
                "operating system": "ios" if "ios" in request.path.lower() else "android",
                "DecryptionKeyError id": str(DecryptionKeyError.objects.last().id),
                "file_name": file_name,
                "bug_report": DECRYPTION_KEY_ADDITIONAL_MESSAGE,
            }
            sentry_client = make_sentry_client(SentryTypes.elastic_beanstalk, tags)
            sentry_client.captureMessage(DECRYPTION_KEY_ERROR_MESSAGE)
        return render_template('blank.html'), 200

    # if uploaded data actually exists, and has a valid extension
    if uploaded_file and file_name and contains_valid_extension(file_name):
        s3_upload(s3_file_location, uploaded_file, participant.study.object_id)
        # race condition: multiple _concurrent_ uploads with same file path. Behavior without
        # try-except is correct, but we don't care about reporting it. Just send the device a 500
        # error so it skips the file, the followup attempt receives 200 code and deletes the file.
        try:
            FileToProcess.append_file_for_processing(
                s3_file_location, participant.study.object_id, participant=participant
            )
        except ValidationError as e:
            # Real error is a second validation inside e.error_dict["s3_file_path"].
            # Ew; just test for this string instead...
            if S3_FILE_PATH_UNIQUE_CONSTRAINT_ERROR in str(e):
                # this tells the device to just move on to the next file, try again later.
                return abort(500)
            else:
                raise
        UploadTracking.objects.create(
            file_path=s3_file_location,
            file_size=len(uploaded_file),
            timestamp=timezone.now(),
            participant=participant,
        )
        return render_template('blank.html'), 200
    elif not uploaded_file:
        # if the file turns out to be empty, delete it, we simply do not care.
        return render_template('blank.html'), 200
    else:
        return make_upload_error_report(participant.patient_id, file_name)
def register_user(OS_API=""):
    """ Checks that the patient id has been granted, and that there is no device registered with
    that id.  If the patient id has no device registered it registers this device and logs the
    bluetooth mac address.  Check the documentation in user_authentication to ensure you have
    provided the proper credentials.  Returns the encryption key for this patient/user. """
    # CASE: If the id and password combination do not match, the decorator returns a 403 error.
    # the following parameter values are required.
    patient_id = request.values['patient_id']
    phone_number = request.values['phone_number']
    device_id = request.values['device_id']

    # These values may not be returned by earlier versions of the beiwe app
    device_os = request.values.get('device_os', "none")
    os_version = request.values.get('os_version', "none")
    product = request.values.get("product", "none")
    brand = request.values.get("brand", "none")
    hardware_id = request.values.get("hardware_id", "none")
    manufacturer = request.values.get("manufacturer", "none")
    model = request.values.get("model", "none")
    beiwe_version = request.values.get("beiwe_version", "none")

    # This value may not be returned by later versions of the beiwe app.
    mac_address = request.values.get('bluetooth_id', "none")

    participant = get_session_participant()
    if participant.device_id and participant.device_id != request.values['device_id']:
        # CASE: this patient has a registered a device already and it does not match this device.
        # They need to contact the study and unregister their their other device. The device
        # will receive a 405 error and should alert the user accordingly.
        # Provided a user does not completely reset their device (which resets the device's
        # unique identifier) they user CAN reregister an existing device, the unlock key they
        # need to enter to at registration is their old password.
        # KG: 405 is good for IOS and Android, no need to check OS_API
        return abort(405)

    if participant.os_type and participant.os_type != OS_API:
        # CASE: this patient has registered, but the user was previously registered with a
        # different device type. To keep the CSV munging code sane and data consistent (don't
        # cross the iOS and Android data streams!) we disallow it.
        return abort(400)

    # At this point the device has been checked for validity and will be registered successfully.
    # Any errors after this point will be server errors and return 500 codes. the final return
    # will be the encryption key associated with this user.

    # Upload the user's various identifiers.
    unix_time = str(calendar.timegm(time.gmtime()))
    file_name = patient_id + '/identifiers_' + unix_time + ".csv"

    # Construct a manual csv of the device attributes
    file_contents = (DEVICE_IDENTIFIERS_HEADER + "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s" %
                     (patient_id, mac_address, phone_number, device_id, device_os, os_version,
                      product, brand, hardware_id, manufacturer, model, beiwe_version)).encode()
    s3_upload(file_name, file_contents, participant.study.object_id)
    FileToProcess.append_file_for_processing(file_name, participant.study.object_id,
                                             participant=participant)

    # set up device.
    participant.device_id = device_id
    participant.os_type = OS_API
    # NOTE(review): device_id/os_type appear to be persisted via set_password's save
    # side effect — confirm set_password saves the whole model, not just the password.
    participant.set_password(request.values['new_password'])  # set password saves the model
    device_settings = participant.study.device_settings.as_unpacked_native_python()
    device_settings.pop('_id', None)

    # set up FCM files
    firebase_plist_data = None
    firebase_json_data = None
    if participant.os_type == 'IOS':
        ios_credentials = FileAsText.objects.filter(tag=IOS_FIREBASE_CREDENTIALS).first()
        if ios_credentials:
            firebase_plist_data = plistlib.loads(ios_credentials.text.encode())
    elif participant.os_type == 'ANDROID':
        android_credentials = FileAsText.objects.filter(tag=ANDROID_FIREBASE_CREDENTIALS).first()
        if android_credentials:
            firebase_json_data = json.loads(android_credentials.text)

    # ensure the survey schedules are updated for this participant.
    repopulate_all_survey_scheduled_events(participant.study, participant)

    return_obj = {
        'client_public_key': get_client_public_key_string(patient_id,
                                                          participant.study.object_id),
        'device_settings': device_settings,
        'ios_plist': firebase_plist_data,
        'android_firebase_json': firebase_json_data,
        'study_name': participant.study.name,
        'study_id': participant.study.object_id,
    }
    return json.dumps(return_obj), 200
def upload(OS_API=""):
    """ Entry point to upload GPS, Accelerometer, Audio, PowerState, Calls Log, Texts Log,
    Survey Response, and debugging files to s3.

    Behavior:
    The Beiwe app is supposed to delete the uploaded file if it receives an html 200 response.
    The API returns a 200 response when the file has A) been successfully handled, B) the file it
    has been sent is empty, C) the file did not decrypt properly.  We encountered problems in
    production with incorrectly encrypted files (as well as Android generating "rList" files
    under unknown circumstances) and the app then uploads them.  When the device receives a 200
    that is its signal to delete the file.
    When a file is undecryptable (this was tracked to a scenario where the device could not
    create/write an AES encryption key) we send a 200 response to stop that device attempting to
    re-upload the data.
    In the event of a single line being undecryptable (can happen due to io errors on the device)
    we drop only that line (and store the erroring line in an attempt to track it down.

    A 400 error means there is something is wrong with the uploaded file or its parameters,
    administrators will be emailed regarding this upload, the event will be logged to the apache
    log.  The app should not delete the file, it should try to upload it again at some point.

    If a 500 error occurs that means there is something wrong server side, administrators will be
    emailed and the event will be logged.  The app should not delete the file, it should try to
    upload it again at some point.

    Request format:
    send an http post request to [domain name]/upload, remember to include security
    parameters (see user_authentication for documentation).
    Provide the contents of the file, encrypted (see encryption specification) and properly
    converted to Base64 encoded text, as a request parameter entitled "file".
    Provide the file name in a request parameter entitled "file_name".
    """
    patient_id = request.values['patient_id']
    user = Participant.objects.get(patient_id=patient_id)

    # Slightly different values for iOS vs Android behavior.
    # Android sends the file data as standard form post parameter (request.values)
    # iOS sends the file as a multipart upload (so ends up in request.files)
    # if neither is found, consider the "body" of the post the file
    # ("body" post is not currently used by any client, only here for completeness)
    if "file" in request.files:
        uploaded_file = request.files['file']
    elif "file" in request.values:
        uploaded_file = request.values['file']
    else:
        uploaded_file = request.data

    # multipart uploads arrive as FileStorage objects; normalize to raw bytes
    if isinstance(uploaded_file, FileStorage):
        uploaded_file = uploaded_file.read()

    file_name = request.values['file_name']
    # print "uploaded file name:", file_name, len(uploaded_file)
    if "crashlog" in file_name.lower():
        # crash logs are reported to administrators, then discarded (200 => device deletes)
        send_android_error_report(patient_id, uploaded_file)
        return render_template('blank.html'), 200
    if file_name[:6] == "rList-":
        # junk files generated by Android; discard them
        return render_template('blank.html'), 200

    client_private_key = get_client_private_key(patient_id, user.study.object_id)
    try:
        uploaded_file = decrypt_device_file(patient_id, uploaded_file, client_private_key, user)
    except HandledError as e:
        # when decrypting fails, regardless of why, we rely on the decryption code
        # to log it correctly and return 200 OK to get the device to delete the file.
        # We do not want emails on these types of errors, so we use log_error explicitly.
        print("the following error was handled:")
        log_error(e, "%s; %s; %s" % (patient_id, file_name, e.message))
        return render_template('blank.html'), 200
    # This is what the decryption failure mode SHOULD be, but we are still identifying the decryption bug
    except DecryptionKeyInvalidError:
        tags = {
            "participant": patient_id,
            "operating system": "ios" if "ios" in request.path.lower() else "android",
            "DecryptionKeyError id": str(DecryptionKeyError.objects.last().id),
            "file_name": file_name,
        }
        make_sentry_client('eb', tags).captureMessage("DecryptionKeyInvalidError")
        return render_template('blank.html'), 200

    # print "decryption success:", file_name
    # if uploaded data a) actually exists, B) is validly named and typed...
    if uploaded_file and file_name and contains_valid_extension(file_name):
        # s3 paths use "/" where the client-side file name uses "_"
        s3_upload(file_name.replace("_", "/"), uploaded_file, user.study.object_id)
        FileToProcess.append_file_for_processing(file_name.replace("_", "/"),
                                                 user.study.object_id, participant=user)
        UploadTracking.objects.create(
            file_path=file_name.replace("_", "/"),
            file_size=len(uploaded_file),
            timestamp=timezone.now(),
            participant=user,
        )
        return render_template('blank.html'), 200
    else:
        error_message = "an upload has failed " + patient_id + ", " + file_name + ", "
        if not uploaded_file:
            # it appears that occasionally the app creates some spurious files
            # with a name like "rList-org.beiwe.app.LoadingActivity"
            error_message += "there was no/an empty file, returning 200 OK so device deletes bad file."
            log_error(Exception("upload error"), error_message)
            return render_template('blank.html'), 200
        elif not file_name:
            error_message += "there was no provided file name, this is an app error."
        elif file_name and not contains_valid_extension(file_name):
            error_message += "contains an invalid extension, it was interpretted as "
            error_message += grab_file_extension(file_name)
        else:
            error_message += "AN UNKNOWN ERROR OCCURRED."
        tags = {"upload_error": "upload error", "user_id": patient_id}
        sentry_client = make_sentry_client('eb', tags)
        sentry_client.captureMessage(error_message)
        return abort(400)
def register_user(OS_API=""):
    """ Checks that the patient id has been granted, and that there is no device registered with
    that id.  If the patient id has no device registered it registers this device and logs the
    bluetooth mac address.  Check the documentation in user_authentication to ensure you have
    provided the proper credentials.  Returns the encryption key for this patient/user. """
    # CASE: If the id and password combination do not match, the decorator returns a 403 error.
    # the following parameter values are required.
    patient_id = request.values['patient_id']
    phone_number = request.values['phone_number']
    device_id = request.values['device_id']

    # These values may not be returned by earlier versions of the beiwe app; default to "none".
    # (MultiDict.get replaces the original nine repetitive try/except BadRequestKeyError
    # blocks with identical behavior.)
    device_os = request.values.get('device_os', "none")
    os_version = request.values.get('os_version', "none")
    product = request.values.get("product", "none")
    brand = request.values.get("brand", "none")
    hardware_id = request.values.get("hardware_id", "none")
    manufacturer = request.values.get("manufacturer", "none")
    model = request.values.get("model", "none")
    beiwe_version = request.values.get("beiwe_version", "none")

    # This value may not be returned by later versions of the beiwe app.
    mac_address = request.values.get('bluetooth_id', "none")

    user = Participant.objects.get(patient_id=patient_id)
    study_id = user.study.object_id
    if user.device_id and user.device_id != request.values['device_id']:
        # CASE: this patient has a registered device already and it does not match this device.
        # They need to contact the study and unregister their other device. The device
        # will receive a 405 error and should alert the user accordingly.
        # Provided a user does not completely reset their device (which resets the device's
        # unique identifier) the user CAN reregister an existing device, the unlock key they
        # need to enter at registration is their old password.
        # KG: 405 is good for IOS and Android, no need to check OS_API
        return abort(405)

    if user.os_type and user.os_type != OS_API:
        # CASE: this patient has registered, but the user was previously registered with a
        # different device type. To keep the CSV munging code sane and data consistent (don't
        # cross the iOS and Android data streams!) we disallow it.
        return abort(400)

    # At this point the device has been checked for validity and will be registered successfully.
    # Any errors after this point will be server errors and return 500 codes. the final return
    # will be the encryption key associated with this user.

    # Upload the user's various identifiers.
    unix_time = str(calendar.timegm(time.gmtime()))
    file_name = patient_id + '/identifiers_' + unix_time + ".csv"

    # Construct a manual csv of the device attributes
    file_contents = (DEVICE_IDENTIFIERS_HEADER + "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s" %
                     (patient_id, mac_address, phone_number, device_id, device_os, os_version,
                      product, brand, hardware_id, manufacturer, model, beiwe_version))
    s3_upload(file_name, file_contents, study_id)
    FileToProcess.append_file_for_processing(file_name, user.study.object_id, participant=user)

    # set up device.
    user.set_device(device_id)
    user.set_os_type(OS_API)
    user.set_password(request.values['new_password'])
    device_settings = user.study.device_settings.as_native_python()
    device_settings.pop('_id', None)
    return_obj = {
        'client_public_key': get_client_public_key_string(patient_id, study_id),
        'device_settings': device_settings
    }
    return json.dumps(return_obj), 200