def reindex_specific_data_type(data_type):
    FileProcessLock.lock()
    print "starting..."
    #this line will raise an error if something is wrong with the data type
    file_name_key = data_stream_to_s3_file_name_string(data_type)
    relevant_chunks = ChunksRegistry(data_type=data_type)
    relevant_indexed_files = [ chunk["chunk_path"] for chunk in relevant_chunks ]
    print "purging old data..."
    for chunk in relevant_chunks: chunk.remove()

    pool = ThreadPool(20)
    pool.map(s3_delete, relevant_indexed_files)

    print "pulling files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data..."
    FileProcessLock.unlock()
    process_file_chunks()
    print "Done."
def do_upload(file_paths_and_contents, data_type=None, forcibly_overwrite=False):
    if data_type is None: raise Exception("DATA TYPE!")
    upload_stream_map = { "survey_answers":("surveyAnswers", "csv"),
                          "audio":("voiceRecording", "mp4") }
    data_stream_string, file_extension = upload_stream_map[data_type]

    for timings_path, (contents, timestamp) in file_paths_and_contents.items():
        study_id_string, user_id, _, survey_id, _ = timings_path.split("/")
        try:
            timestamp_string = str( int( mktime( timestamp.timetuple( ) ) ) ) + "000"
        except AttributeError:
            print "PROBLEM WITH TIMESTAMP FROM: %s" % timings_path
            continue
        if len(timestamp_string) != 13:
            raise Exception("LOL! No. (bad timestamp from %s: %r)" % (timings_path, timestamp_string))

        study_obj_id = Study(ObjectId(study_id_string))._id

        s3_file_path = "%s/%s/%s/%s/%s.%s" % (study_id_string,
                                              user_id,
                                              data_stream_string,
                                              survey_id,
                                              timestamp_string,
                                              file_extension)
        if len(s3_list_files(s3_file_path)) != 0:
            print "ALREADY_EXISTS: %s, %s" % (timings_path, s3_file_path)
            if not forcibly_overwrite:
                continue
        else: print "yay!: ", s3_file_path
        contents = contents.encode("utf8") #maybe make this unicode-16?

        s3_upload(s3_file_path, contents, study_obj_id, raw_path=True)
        FileToProcess.append_file_for_processing( s3_file_path, study_obj_id, user_id )
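
# A minimal usage sketch for do_upload. Per the split("/") above, each key is a
# slash-delimited timings path of exactly five segments
# (study_id/user_id/<unused>/survey_id/<unused>) and each value is a
# (contents, timestamp) tuple. The ids and file contents below are made up for
# illustration.
def example_do_upload():
    from datetime import datetime
    fake_uploads = {
        "55d3826297013e3a1c9b8c3e/h6fflp/surveyAnswers/abc123/0.csv":
            ("question id,answer\n1,yes", datetime(2016, 1, 1)),
    }
    do_upload(fake_uploads, data_type="survey_answers")
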
def reindex_all_files_to_process():
    """ Totally removes the FilesToProcess DB, deletes all chunked files on s3,
    clears the chunksregistry, and then adds all relevent files on s3 to the
    files to process registry. """
    FileProcessLock.lock()
    print str(datetime.now()), "purging FilesToProcess:", FilesToProcess.count()
    FileToProcess.db().drop()
    print str(datetime.now()), "purging existing ChunksRegistry", ChunksRegistry.count()
    ChunkRegistry.db().drop()

    pool = ThreadPool(CONCURRENT_NETWORK_OPS * 2 )

    print str(datetime.now()), "deleting older chunked data:",
    CHUNKED_DATA = s3_list_files(CHUNKS_FOLDER)
    print len(CHUNKED_DATA)
    pool.map(s3_delete, CHUNKED_DATA)
    del CHUNKED_DATA

    print str(datetime.now()), "pulling new files to process..."
    files_lists = pool.map(s3_list_files, [str(s._id) for s in Studies()] )
    print "putting new files to process..."
    for i,l in enumerate(files_lists):
        print str(datetime.now()), i+1, "of", str(Studies.count()) + ",", len(l), "files"
        for fp in l:
            if fp[-4:] in PROCESSABLE_FILE_EXTENSIONS:
                FileToProcess.append_file_for_processing(fp, ObjectId(fp.split("/", 1)[0]), fp.split("/", 2)[1])
    del files_lists, l
    pool.close()
    pool.terminate()
    print str(datetime.now()), "processing data."
    FileProcessLock.unlock()
    process_file_chunks()
def completely_purge_study(study_id, actually_delete=False):
    if not isinstance(study_id, ObjectId):
        study_id = ObjectId(study_id)
    study = Study(study_id)

    surveys = study["surveys"]
    device_settings = study["device_settings"]
    users = Users(study_id=study_id)
    chunks = ChunksRegistry(study_id=study_id)
    files_to_process = FilesToProcess(study_id=study_id)
    if not actually_delete:
        print "if you actually delete this you will not be able to decrypt anything " \
              "from this study.  Don't do it unless you know what you are doing."
        print study.name
        # print len(study)
        # print len(device_settings)
        print len(surveys), "surveys"
        print len(users), "users"
        print len(chunks), "chunks"
        print len(files_to_process), "files to process"
    else:
        StudyDeviceSettings(device_settings).remove()
        for s in surveys: Survey(s).remove()
        for u in users: User(u).remove()
        for c in chunks: ChunkRegistry(c).remove()
        for f in files_to_process: FileToProcess(f).remove()
        study.remove()
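
# A minimal usage sketch: call once without actually_delete to review the
# counts of what would be removed, then opt in explicitly. The study id below
# is hypothetical.
def example_purge_study():
    completely_purge_study("55d3826297013e3a1c9b8c3e")  # dry run, prints counts
    # completely_purge_study("55d3826297013e3a1c9b8c3e", actually_delete=True)
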
def upload(OS_API=""):
    """ Entry point to upload GPS, Accelerometer, Audio, PowerState, Calls Log, Texts Log,
    Survey Response, and debugging files to s3.

    Behavior:
    The Beiwe app is supposed to delete the uploaded file if it receives an html 200 response.
    The API returns a 200 response when A) the file has been successfully handled, B) the file
    it was sent is empty, or C) the file did not decrypt properly.  We encountered problems in
    production with incorrectly encrypted files (as well as Android generating "rList" files
    under unknown circumstances), which the app then uploads.  The source of the encryption
    errors is not well understood and could not be tracked down.  In order to salvage partial
    data the server decrypts files to the best of its ability and uploads them to S3.  In
    order to delete these files from the device we still send a 200 response.

    (The above about encryption is awful; in a theoretical version 2.0 the 200 response would be
    replaced with a different response code to allow for better debugging and fewer ... hax.)

    A 400 error means something is wrong with the uploaded file or its parameters;
    administrators will be emailed regarding this upload, and the event will be logged to the
    apache log.  The app should not delete the file; it should try to upload it again at some point.

    If a 500 error occurs, something is wrong server side; administrators will be emailed and
    the event will be logged.  The app should not delete the file; it should try to upload it
    again at some point.

    Request format:
    Send an HTTP POST request to studies.beiwe.org/upload; remember to include the security
    parameters (see user_authentication for documentation).  Provide the contents of the file,
    encrypted (see the encryption specification) and properly converted to Base64 encoded text,
    in a request parameter entitled "file", and provide the file name in a request parameter
    entitled "file_name". """
    patient_id = request.values['patient_id']
    user = User(patient_id)

    #Slightly different values for iOS vs Android behavior.
    #Android sends the file data as standard form post parameter (request.values)
    #iOS sends the file as a multipart upload (so ends up in request.files)
    #if neither is found, consider the "body" of the post the file
    #("body" post is not currently used by any client, only here for completeness)
    if "file" in request.files:
        uploaded_file = request.files['file']
    elif "file" in request.values:
        uploaded_file = request.values['file']
    else:
        uploaded_file = request.data

    if isinstance(uploaded_file, FileStorage):
        uploaded_file = uploaded_file.read()

    file_name = request.values['file_name']
    #     print "uploaded file name:", file_name, len(uploaded_file)
    if "crashlog" in file_name.lower():
        send_android_error_report(user._id, uploaded_file)
        return render_template('blank.html'), 200

    if file_name[:6] == "rList-":
        return render_template('blank.html'), 200

    client_private_key = get_client_private_key(patient_id, user['study_id'])
    try:
        uploaded_file = decrypt_device_file(patient_id, uploaded_file,
                                            client_private_key, user)
    except HandledError as e:
        # when decrypting fails, regardless of why, we rely on the decryption code
        # to log it correctly and return 200 OK to get the device to delete the file.
        # We do not want emails on these types of errors, so we use log_error explicitly.
        print "the following error was handled:"
        log_error(e, "%s; %s; %s" % (patient_id, file_name, e.message))
        return render_template('blank.html'), 200
    # except DecryptionKeyInvalidError:
    #     return render_template('blank.html'), 200
    except OurBase64Error:
        if IS_STAGING:
            print "decryption problems" + "#" * 200
            print
            print patient_id
            print
            print file_name
            print uploaded_file
            print
        raise

    print "decryption success:", file_name
    # if the uploaded data A) actually exists, B) is validly named and typed...
    if uploaded_file and file_name and contains_valid_extension(file_name):
        s3_upload(file_name.replace("_", "/"), uploaded_file, user["study_id"])
        FileToProcess.append_file_for_processing(file_name.replace("_", "/"),
                                                 user["study_id"], patient_id)
        UploadTracking.create(
            {
                "file_path": file_name.replace("_", "/"),
                "timestamp": datetime.utcnow(),
                "user_id": patient_id,
                "file_size": len(uploaded_file)
            },
            random_id=True)
        return render_template('blank.html'), 200

    else:
        error_message = "an upload has failed " + patient_id + ", " + file_name + ", "
        if not uploaded_file:
            #it appears that occasionally the app creates some spurious files
            #with a name like "rList-org.beiwe.app.LoadingActivity"
            error_message += "there was no/an empty file, returning 200 OK so device deletes bad file."
            log_error(Exception("upload error"), error_message)
            return render_template('blank.html'), 200

        elif not file_name:
            error_message += "no file name was provided; this is an app error."
        elif file_name and not contains_valid_extension(file_name):
            error_message += "the file name contains an invalid extension, it was interpreted as "
            error_message += grab_file_extension(file_name)
        else:
            error_message += "AN UNKNOWN ERROR OCCURRED."

        sentry_client = SentryClient(dsn=SENTRY_DSN,
                                     tags={
                                         "upload_error": "upload error",
                                         "user_id": user._id
                                     },
                                     transport=HTTPTransport)
        sentry_client.captureMessage(error_message)

        # log_and_email_500_error(Exception("upload error"), error_message)
        return abort(400)
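
# A minimal client-side sketch of the request format described in the upload
# docstring, using the third-party requests library. The patient id and file
# name are hypothetical, and the security parameters are elided; see
# user_authentication for the real credential fields.
def example_upload_request(encrypted_base64_contents):
    import requests
    params = {
        "patient_id": "h6fflp",  # hypothetical patient id
        "file_name": "h6fflp_gps_1470000000000.csv",
        "file": encrypted_base64_contents,  # encrypted, Base64 encoded text
    }
    # params.update(...) here with the security parameters documented in
    # user_authentication.
    return requests.post("https://studies.beiwe.org/upload", data=params)
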
def register_user(OS_API=""):
    """ Checks that the patient id has been granted, and that there is no device registered with
    that id.  If the patient id has no device registered it registers this device and logs the
    bluetooth mac address.
    Check the documentation in user_authentication to ensure you have provided the proper credentials.
    Returns the encryption key for this patient/user. """

    #CASE: If the id and password combination do not match, the decorator returns a 403 error.
    #the following parameter values are required.
    patient_id = request.values['patient_id']
    phone_number = request.values['phone_number']
    device_id = request.values['device_id']

    # These values may not be sent by earlier versions of the Beiwe app, so default to "none".
    device_os = request.values.get('device_os', "none")
    os_version = request.values.get('os_version', "none")
    product = request.values.get("product", "none")
    brand = request.values.get("brand", "none")
    hardware_id = request.values.get("hardware_id", "none")
    manufacturer = request.values.get("manufacturer", "none")
    model = request.values.get("model", "none")
    beiwe_version = request.values.get("beiwe_version", "none")
    # This value may not be sent by later versions of the Beiwe app.
    mac_address = request.values.get('bluetooth_id', "none")

    user = User(patient_id)
    study_id = user['study_id']

    if user['device_id'] is not None and user['device_id'] != device_id:
        # CASE: this patient has registered a device already and it does not match this device.
        #   They need to contact the study and unregister their other device.  The device
        #   will receive a 405 error and should alert the user accordingly.
        # Provided a user does not completely reset their device (which resets the device's
        # unique identifier), the user CAN re-register an existing device; the unlock key they
        # need to enter at registration is their old password.
        # KG: 405 is good for iOS and Android, no need to check OS_API
        return abort(405)

    if user['os_type'] is not None and user['os_type'] != OS_API:
        # CASE: this patient has registered, but the user was previously registered with a
        # different device type. To keep the CSV munging code sane and data consistent (don't
        # cross the iOS and Android data streams!) we disallow it.
        return abort(400)

    # At this point the device has been checked for validity and will be registered successfully.
    # Any errors after this point will be server errors and return 500 codes. the final return
    # will be the encryption key associated with this user.

    #Upload the user's various identifiers.
    unix_time = str(calendar.timegm(time.gmtime()))
    file_name = patient_id + '/identifiers_' + unix_time + ".csv"
    #construct a manual csv of the device attributes
    file_contents = (DEVICE_IDENTIFIERS_HEADER +
                     "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s" %
                     (patient_id, mac_address, phone_number, device_id,
                      device_os, os_version, product, brand, hardware_id,
                      manufacturer, model, beiwe_version))
    # print file_contents, "\n"
    s3_upload(file_name, file_contents, study_id)
    FileToProcess.append_file_for_processing(file_name, user['study_id'],
                                             patient_id)

    # set up device.
    user.set_device(device_id)
    user.set_os_type(OS_API)
    user.set_password(request.values['new_password'])
    device_settings = Study(study_id).get_study_device_settings()
    device_settings.pop('_id', None)
    return_obj = {
        'client_public_key':
        get_client_public_key_string(patient_id, study_id),
        'device_settings': device_settings
    }
    return json.dumps(return_obj), 200
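
# A minimal client-side sketch of a registration request. The endpoint path and
# every value below are assumptions for illustration; patient_id, phone_number,
# device_id, and new_password are the parameters read unconditionally above,
# and the security parameters from user_authentication are elided.
def example_register_request():
    import requests
    return requests.post(
        "https://studies.beiwe.org/register_user",
        data={
            "patient_id": "h6fflp",
            "phone_number": "5551234567",
            "device_id": "some-unique-device-identifier",
            "new_password": "correct horse battery staple",
            # optionally: device_os, os_version, product, brand, hardware_id,
            # manufacturer, model, beiwe_version, bluetooth_id
        })
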
if __name__ == "__main__":
    from os.path import abspath as _abspath
    import imp as _imp
    _current_folder_init = _abspath(__file__).rsplit('/', 1)[0]+ "/__init__.py"
    _imp.load_source("__init__", _current_folder_init)

from libs.s3 import s3_list_files
from db.data_access_models import FileToProcess, FilesToProcess
from bson import ObjectId

study_id_obj = ObjectId("5873fe38644ad7557b168e43")
study_id_str = str(study_id_obj)

for purgeable in FilesToProcess(user_id='prx7ap5x'):
    purgeable.remove()

for i, path in enumerate(s3_list_files(study_id_str, as_generator=True)):
    if i > 500:
        break
    if not path.endswith('csv'):
        continue  # skip non-csv files
    user_id = path[:-4].split('/')[1]
    path_sans_study = path.split("/", 1)[1]
    if FileToProcess(s3_file_path=path):
        print "%s already in FilesToProcess." % path
        continue
    FileToProcess.append_file_for_processing(path_sans_study, study_id_obj, user_id)
def do_process_user_file_chunks(count, error_handler, skip_count, user_id):
    """
    Run through the files to process, pull their data, put it into s3 bins. Run the file through
    the appropriate logic path based on file type.

    If a file is empty, add its ftp object to ftps_to_remove; we can't delete objects
    in-place while iterating over the db.

    All files except for the audio recording files are in the form of CSVs; most of those files
    can be separated by "time bin" (separated into one-hour chunks) and concatenated and sorted
    trivially. A few files (the call log, the identifier file, and the wifi log) require some
    triage beforehand.  The debug log cannot be correctly sorted by time for all elements,
    because it was not actually expected to be used by researchers, but is apparently quite
    useful.

    Any errors are themselves concatenated using the passed in error handler.
    """
    #this is how you declare a defaultdict containing a tuple of two deques.
    all_binified_data = defaultdict(lambda: (deque(), deque()))
    ftps_to_remove = set()
    pool = ThreadPool(CONCURRENT_NETWORK_OPS)
    survey_id_dict = {}

    for data in pool.map(batch_retrieve_for_processing,
                         FilesToProcess(page_size=count + skip_count,
                                        user_id=user_id)[skip_count:],
                         chunksize=1):
        with error_handler:
            #raise errors that we encountered in the s3 access threaded operations to the error_handler
            if data['exception']:
                print "\n" + data['ftp']['s3_file_path']
                print data['traceback']
                raise data['exception']

            if data['chunkable']:
                # print "1a"
                newly_binified_data, survey_id_hash = process_csv_data(data)
                # print data, "\n1b"
                if data['data_type'] in SURVEY_DATA_FILES:
                    # print survey_id_hash
                    survey_id_dict[survey_id_hash] = \
                        resolve_survey_id_from_file_name(data['ftp']["s3_file_path"])
                if newly_binified_data:
                    # print "1c"
                    append_binified_csvs(all_binified_data,
                                         newly_binified_data, data['ftp'])
                else:  # delete empty files from FilesToProcess
                    # print "1d"
                    ftps_to_remove.add(data['ftp']._id)
                continue

            else:  #if not data['chunkable']
                # print "2a"
                timestamp = clean_java_timecode(
                    data['ftp']["s3_file_path"].rsplit("/", 1)[-1][:-4])
                # print "2a"
                ChunkRegistry.add_new_chunk(data['ftp']["study_id"],
                                            data['ftp']["user_id"],
                                            data['data_type'],
                                            data['ftp']["s3_file_path"],
                                            timestamp)
                # print "2b"
                ftps_to_remove.add(data['ftp']._id)

    pool.close()
    pool.terminate()
    # print 3
    more_ftps_to_remove, number_bad_files = upload_binified_data(
        all_binified_data, error_handler, survey_id_dict)
    # print "X"
    ftps_to_remove.update(more_ftps_to_remove)
    for ftp_id in ftps_to_remove:
        FileToProcess(ftp_id).remove()
    # print "Y"
    gc.collect()
    # print "Z"
    return number_bad_files
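
# A minimal sketch of the all_binified_data shape used above: a defaultdict
# whose values are a (rows, source ftps) pair of deques. The bin key tuple here
# is illustrative, not necessarily the exact key produced by process_csv_data.
def example_binified_layout():
    from collections import defaultdict, deque
    binified = defaultdict(lambda: (deque(), deque()))
    rows, source_ftps = binified[("study_id", "user_id", "gps", 1470000000)]
    rows.append(["1470000000000", "42.0", "-71.0"])  # one csv row, as a list
    source_ftps.append("ftp id placeholder")
    return binified
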
def create_fake_mp4(number=10):
    # assumes a local file named "thing" whose contents stand in for audio data
    for x in range(number):
        with open("thing", "r") as f:
            file_path = "55d3826297013e3a1c9b8c3e/h6fflp/voiceRecording/%s.mp4" % (1000000000 + x)
            s3_upload(file_path, f.read(), ObjectId("55d3826297013e3a1c9b8c3e"), raw_path=True)
            FileToProcess.append_file_for_processing(file_path, ObjectId("55d3826297013e3a1c9b8c3e"), "h6fflp")