def reindex_all_files_to_process():
    """ Totally removes the FilesToProcess DB, deletes all chunked files on s3,
    clears the ChunksRegistry, and then adds all relevant files on s3 to the
    files to process registry. """
    FileProcessLock.lock()
    print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count()
    FileToProcess.db().drop()
    print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count()
    ChunkRegistry.db().drop()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2)

    print str(datetime.now()), "deleting older chunked data:",
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print len(CHUNKED_DATA)
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    print str(datetime.now()), "pulling new files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()])
    print "putting new files to process..."
    for i, l in enumerate(files_lists):
        print str(datetime.now()), i + 1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                # s3 keys are prefixed with the study object id, then the user id.
                FileToProcess.append_file_for_processing(
                        fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l

    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data."
    FileProcessLock.unlock()
    process_file_chunks()

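# A minimal, standalone sketch of the ThreadPool fan-out pattern used above for the s3
# deletes and listings. Assumptions: the ThreadPool in this module is
# multiprocessing.pool.ThreadPool, and _fake_s3_delete below is a stand-in for the real
# s3_delete; none of the underscored names here come from this codebase.
from multiprocessing.pool import ThreadPool as _SketchThreadPool

def _fake_s3_delete(key):
    # stand-in for a blocking network call; the real s3_delete would issue the DELETE.
    return key

def _threadpool_fanout_sketch(keys, concurrency=8):
    pool = _SketchThreadPool(concurrency)
    try:
        # map blocks until every key has been handed to a worker and processed.
        return pool.map(_fake_s3_delete, keys)
    finally:
        pool.close()
        pool.terminate()
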
def completely_purge_study(study_id, actually_delete=False):
    """ Deletes a study and everything attached to it: surveys, device settings,
    users, chunk registry entries, and files to process. By default this is a
    dry run that only prints the counts of what would be removed; pass
    actually_delete=True to really delete. """
    if not isinstance(study_id, ObjectId):
        study_id = ObjectId(study_id)
    study = Study(study_id)
    surveys = study["surveys"]
    device_settings = study["device_settings"]
    users = Users(study_id=study_id)
    chunks = ChunksRegistry(study_id=study_id)
    files_to_process = FilesToProcess(study_id=study_id)

    if not actually_delete:
        print "if you actually delete this you will not be able to decrypt anything " \
              "from this study. Don't do it unless you know what you are doing."
        print study.name
        # print len(study)
        # print len(device_settings)
        print len(surveys)
        print len(users)
        print len(chunks)
        print len(files_to_process)
    else:
        StudyDeviceSettings(device_settings).remove()
        [Survey(s).remove() for s in surveys]
        [User(u).remove() for u in users]
        [ChunkRegistry(c).remove() for c in chunks]
        [FileToProcess(f).remove() for f in files_to_process]
        study.remove()

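# Hedged usage sketch for completely_purge_study: do the dry run first to eyeball the
# counts, then opt in to deletion. The study id below is hypothetical, and this assumes a
# live database connection with the models above importable.
def _purge_study_usage_sketch(study_id="0123456789abcdef01234567"):
    completely_purge_study(study_id)                          # dry run: prints counts only
    # completely_purge_study(study_id, actually_delete=True)  # irreversible; uncomment deliberately
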
def get_user_list_safely(retries=10):
    """ This error started occurring occasionally on Mar 22, 2017; we don't know why. """
    try:
        return set(FilesToProcess(field="user_id"))
    except CursorNotFound:
        if retries < 1:
            raise
        print "encountered cursor error, retrying..."
        sleep(0.1)
        return get_user_list_safely(retries=retries - 1)

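# The retry above is recursive; the same pattern in iterative form is sketched below with a
# generic callable, so nothing here depends on FilesToProcess or pymongo. flaky_call and the
# retryable exception tuple are placeholders, not names from this codebase.
import time as _time

def _retry_sketch(flaky_call, retries=10, wait=0.1, retryable=(Exception,)):
    while True:
        try:
            return flaky_call()
        except retryable:
            if retries < 1:
                raise
            retries -= 1
            print "encountered error, retrying..."
            _time.sleep(wait)
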
def process_file_chunks():
    """ This is the function that is called from cron. It runs through all new files
    that have been uploaded and 'chunks' them. Handles logic for skipping bad files,
    raising errors appropriately. """
    error_handler = ErrorHandler()
    if FileProcessLock.islocked():
        raise ProcessingOverlapError(
                "Data processing overlapped with a previous data indexing run.")
    FileProcessLock.lock()
    number_bad_files = 0

    user_ids = set(FilesToProcess(field="user_id"))
    print "processing files for the following users: %s" % ",".join(user_ids)
    for user_id in user_ids:
        while True:
            previous_number_bad_files = number_bad_files
            starting_length = FilesToProcess.count(user_id=user_id)

            print str(datetime.now()), "processing %s, %s files remaining" % (user_id, starting_length)
            number_bad_files += do_process_user_file_chunks(
                    count=FILE_PROCESS_PAGE_SIZE,
                    error_handler=error_handler,
                    skip_count=number_bad_files,
                    user_id=user_id)

            if starting_length == FilesToProcess.count(user_id=user_id):  # zero files processed
                if previous_number_bad_files == number_bad_files:
                    # Cases:
                    #   every file broke, might as well fail here (would cause an infinite loop otherwise).
                    #   no new files.
                    break
                else:
                    continue
    FileProcessLock.unlock()
    error_handler.raise_errors()
    raise EverythingWentFine(DATA_PROCESSING_NO_ERROR_STRING)

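# Toy model of the paging/termination logic above: process pages of work, skip over items
# that have already failed (mirroring the production assumption that bad entries accumulate
# at the front of the queue because only processed entries are removed), and stop once a pass
# neither shrinks the queue nor finds new bad items. Everything below is illustrative; no
# database is involved.
def _paging_termination_sketch(items, page_size=3):
    bad = 0
    while True:
        previous_bad = bad
        starting_length = len(items)
        for item in items[bad:bad + page_size]:
            if item == "bad":
                bad += 1                # leave it in place; later passes skip over it
            else:
                items.remove(item)      # "processed" items leave the queue
        if len(items) == starting_length:  # zero items processed this pass
            if previous_bad == bad:
                break                   # only bad items (or nothing) remain
            continue
    return bad

# e.g. _paging_termination_sketch(["a", "bad", "b", "c", "bad", "d"]) returns 2.
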
def celery_process_file_chunks(user_id):
    """ This is the per-user function that the celery task runs. It runs through all
    new files that have been uploaded for that user and 'chunks' them. Handles logic
    for skipping bad files, raising errors appropriately. """
    log = LogList()
    number_bad_files = 0
    error_sentry = ErrorSentry(
            SENTRY_DSN,
            sentry_client_kwargs={
                "tags": {"user_id": user_id},
                'transport': HTTPTransport,
            })
    log.append("processing files for %s" % user_id)

    while True:
        previous_number_bad_files = number_bad_files
        starting_length = FilesToProcess.count(user_id=user_id)

        log.append(str(datetime.now()) + " processing %s, %s files remaining" % (user_id, starting_length))
        number_bad_files += do_process_user_file_chunks(
                count=FILE_PROCESS_PAGE_SIZE,
                error_handler=error_sentry,
                skip_count=number_bad_files,
                user_id=user_id)

        if starting_length == FilesToProcess.count(user_id=user_id):  # zero files processed
            if previous_number_bad_files == number_bad_files:
                # Cases:
                #   every file broke, blow up (would cause an infinite loop otherwise).
                #   no new files.
                break
            else:
                continue

"Go check on reindex operation.", source_email="*****@*****.**") for study_id in studies: if isinstance(study_id, (str, unicode)): study_id = ObjectId(study_id) study = Study(study_id) print "=============================================================" print "=============================================================" print "=============================================================" print "starting on %s, study id: %s" % (study.name, str(study_id)) print "=============================================================" print "=============================================================" print "=============================================================" study_id = ObjectId(study_id) try: reindex_study(study_id) except Exception as e: process_file_chunks( ) #will raise an error if things fail on second attempt if FilesToProcess.count() != 0: do_email(study) raise Exception("stopped on " + str(study_id)) email_system_administrators("OMG IT FINISHED AND EVERYTHING IS DONE.", "Go git checkout .; touch wsgi.py", source_email="*****@*****.**")
if __name__ == "__main__": from os.path import abspath as _abspath import imp as _imp _current_folder_init = _abspath(__file__).rsplit('/', 1)[0]+ "/__init__.py" _imp.load_source("__init__", _current_folder_init) from libs.s3 import s3_list_files from db.data_access_models import FileToProcess, FilesToProcess from bson import ObjectId study_id_obj = ObjectId("5873fe38644ad7557b168e43") study_id_str = str(study_id_obj) for purgeable in FilesToProcess(user_id='prx7ap5x'): purgeable.remove() for i, path in enumerate(s3_list_files(study_id_str , as_generator=True)): if i > 500: break if path[-3:] != 'csv': continue # skip if not a csv file... user_id = path[:-4].split('/')[1] path_sans_study = path.split("/", 1)[1] if FileToProcess(s3_file_path=path): print "%s already in FilesToProcess." % path continue FileToProcess.append_file_for_processing(path_sans_study, study_id_obj, user_id)
def do_process_user_file_chunks(count, error_handler, skip_count, user_id):
    """ Run through the files to process, pull their data, put it into s3 bins.
    Run each file through the appropriate logic path based on file type.

    If a file is empty, add its ftp object to ftps_to_remove; we can't delete
    objects in-place while iterating over the db.

    All files except for the audio recording files are in the form of CSVs; most
    of those files can be separated by "time bin" (separated into one-hour chunks)
    and concatenated and sorted trivially. A few files (the call log, the
    identifier file, and the wifi log) require some triage beforehand. The debug
    log cannot be correctly sorted by time for all elements because it was not
    actually expected to be used by researchers, but it is apparently quite useful.

    Any errors are themselves concatenated using the passed-in error handler. """
    # a defaultdict whose values are a tuple of two deques.
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set([])
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    for data in pool.map(batch_retrieve_for_processing,
                         FilesToProcess(page_size=count + skip_count, user_id=user_id)[skip_count:],
                         chunksize=1):
        with error_handler:
            # raise errors encountered in the threaded s3 access operations to the error handler
            if data['exception']:
                print "\n" + data['ftp']['s3_file_path']
                print data['traceback']
                raise data['exception']

            if data['chunkable']:
                newly_binified_data, survey_id_hash = process_csv_data(data)
                if data['data_type'] in SURVEY_DATA_FILES:
                    survey_id_dict[survey_id_hash] = resolve_survey_id_from_file_name(
                            data['ftp']["s3_file_path"])

                if newly_binified_data:
                    append_binified_csvs(all_binified_data, newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    ftps_to_remove.add(data['ftp']._id)
                continue
            else:  # if not data['chunkable']
                timestamp = clean_java_timecode(
                        data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                ChunkRegistry.add_new_chunk(data['ftp']["study_id"], data['ftp']["user_id"],
                                            data['data_type'], data['ftp']["s3_file_path"],
                                            timestamp)
                ftps_to_remove.add(data['ftp']._id)

    pool.close()
    pool.terminate()
    more_ftps_to_remove, number_bad_files = upload_binified_data(
            all_binified_data, error_handler, survey_id_dict)
    ftps_to_remove.update(more_ftps_to_remove)
    for ftp_id in ftps_to_remove:
        FileToProcess(ftp_id).remove()
    gc.collect()
    return number_bad_files

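# Standalone sketch of the all_binified_data structure above: a defaultdict whose values are
# a (rows, source-ftp-ids) pair of deques, keyed by some bin identifier. The bin key layout
# and the merge helper below are illustrative guesses, not this codebase's append_binified_csvs.
from collections import defaultdict as _defaultdict, deque as _deque

def _merge_binified_sketch(all_binified, new_rows_by_bin, ftp_id):
    for bin_key, rows in new_rows_by_bin.items():
        all_binified[bin_key][0].extend(rows)    # csv rows accumulated for this time bin
        all_binified[bin_key][1].append(ftp_id)  # remember which source file contributed

def _binified_demo():
    all_binified = _defaultdict(lambda: (_deque(), _deque()))
    _merge_binified_sketch(all_binified, {("user", "gps", 0): [["t0", "lat", "lon"]]}, "ftp-id-1")
    return all_binified
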