Python get_project_for_subject 예제들, misc_utils.get_project_for_subject Python 예제들

예제 #1

0

파일 보기

파일: load_files.py 프로젝트: azamkhan99/honours_project

def load_static_airspeck_file(sid_or_uuid,
                              project_name=None,
                              sensor_label=None,
                              suffix_filename="",
                              upload_type='automatic',
                              subject_visit_number=None,
                              calibrate_pm=False,
                              calibrate_ox=False,
                              calibrate_no2=False,
                              use_all_features_for_pm_calibration=False,
                              use_all_features_for_gas_calibration=True,
                              return_calibration_flag=False,
                              calibration_id=None,
                              filename=None,
                              country_name=None):
    '''
    Load a static Airspeck csv file from the project directory and optionally calibrate it.
    :param sid_or_uuid: Subject ID (6 characters) or sensor UUID. For a 6-character ID the project is
    looked up automatically if project_name is not given.
    :param project_name: Key into project_mapping. Required when sid_or_uuid is not a 6-character ID.
    :param sensor_label: Label used as the filename prefix. Defaults to "[ID]" or "[ID]([visit number])".
    :param suffix_filename: String inserted right before ".csv" in the default filename.
    :param upload_type: Either 'automatic' or 'sd_card'. Part of the default filename.
    :param subject_visit_number: If given, appended in parentheses to the default sensor label.
    :param calibrate_pm: Passed on to calibrate_airspeck.
    :param calibrate_ox: Passed on to calibrate_airspeck.
    :param calibrate_no2: Passed on to calibrate_airspeck.
    :param use_all_features_for_pm_calibration: Passed to calibrate_airspeck as use_all_features_pm.
    :param use_all_features_for_gas_calibration: Passed to calibrate_airspeck as use_all_features_gas.
    :param return_calibration_flag: If True AND at least one calibrate_* flag is set, return the tuple
    (result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data) instead of just data.
    The flag is ignored when no calibration was requested.
    :param calibration_id: Passed on to calibrate_airspeck.
    :param filename: Filename to load. If None, "[label]_static_airspeck_[upload_type][suffix].csv".
    :param country_name: Passed on to calibrate_airspeck.
    :return: Whatever load_airrespeck_file returns (after optional calibration), or the 5-tuple
    described under return_calibration_flag.
    :raises ValueError: If project_name is None and sid_or_uuid is not a 6-character subject ID.
    '''
    assert upload_type in [
        'automatic', 'sd_card'
    ], "upload_type has to be either 'automatic' or 'sd_card'"

    if project_name is None:
        if len(sid_or_uuid) == 6:
            project_name = get_project_for_subject(sid_or_uuid)
        else:
            # Without a project there is no directory to load from. Fail with a clear
            # message instead of an opaque lookup error on project_mapping[None].
            raise ValueError(
                "When passing a UUID and not a subject ID, also specify a project_name so that the "
                "correct directory can be selected")

    if sensor_label is None:
        if subject_visit_number is None:
            sensor_label = "{}".format(sid_or_uuid)
        else:
            sensor_label = "{}({})".format(sid_or_uuid, subject_visit_number)

    if filename is None:
        filename = "{}_static_airspeck_{}{}.csv".format(
            sensor_label, upload_type, suffix_filename)

    filepath = project_mapping[project_name][2] + filename
    print("Loading file: {}".format(filepath))
    data = load_airrespeck_file(filepath, project_name)

    # NOTE: gas outlier filtering (filter_out_outliers_gas) is currently disabled here.

    if calibrate_pm or calibrate_ox or calibrate_no2:
        result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data = calibrate_airspeck(
            sid_or_uuid,
            data,
            calibrate_pm=calibrate_pm,
            calibrate_no2=calibrate_no2,
            calibrate_ox=calibrate_ox,
            project_name=project_name,
            calibration_id=calibration_id,
            use_all_features_pm=use_all_features_for_pm_calibration,
            use_all_features_gas=use_all_features_for_gas_calibration,
            country_name=country_name)

        if return_calibration_flag:
            return result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data

    return data

예제 #2

0

파일 보기

파일: load_files.py 프로젝트: azamkhan99/honours_project

def load_respeck_file(subject_id,
                      project_name=None,
                      filter_out_not_worn=True,
                      subject_visit_number=None,
                      upload_type='automatic',
                      suffix_filename="",
                      raw_file=False):
    '''
    Read a Respeck csv file from the project directory via load_airrespeck_file.
    :param subject_id: Subject ID, used as the filename prefix and for the project lookup.
    :param project_name: Key into project_mapping. Looked up from the subject ID when None.
    :param filter_out_not_worn: If True and the loaded data is non-empty, the breathing rate is set to
    NaN for periods lying on the stomach and for periods where the Respeck was not worn.
    :param subject_visit_number: If given, "([number])" is appended to the subject ID in the filename.
    :param upload_type: Part of the filename: "[label]_respeck_[upload_type]...".
    :param suffix_filename: String inserted right before ".csv".
    :param raw_file: If True, load the "_raw" variant of the file.
    :return: Whatever load_airrespeck_file returns for the file (may be None).
    '''
    label_files = ("{}".format(subject_id) if subject_visit_number is None
                   else "{}({})".format(subject_id, int(subject_visit_number)))

    if project_name is None:
        project_name = get_project_for_subject(subject_id)

    # Raw and processed files only differ in the "_raw" marker of the name
    name_template = "{}_respeck_{}_raw{}.csv" if raw_file else "{}_respeck_{}{}.csv"
    filepath = project_mapping[project_name][2] + name_template.format(
        label_files, upload_type, suffix_filename)

    print("Loading file: {}".format(filepath))
    respeck_data = load_airrespeck_file(filepath, project_name)

    # Nothing to filter: no data, filtering disabled, or empty frame
    if respeck_data is None or not filter_out_not_worn or len(
            respeck_data) <= 0:
        return respeck_data

    set_breathing_rate_nan_when_lying_on_stomach(respeck_data)
    set_breathing_rate_nan_when_not_worn(respeck_data)
    return respeck_data

예제 #3

0

파일 보기

def download_respeck_data(subject_id,
                          upload_type='automatic',
                          is_minute_averaged=True,
                          timeframe=None,
                          overwrite_if_already_exists=False,
                          subject_visit_number=None,
                          suffix_filename="",
                          filename=None,
                          project_name=None,
                          out_directory=None):
    '''
    Download Respeck data of a subject into a csv file, unless that file already exists.
    :param subject_id: Subject ID, used for the project lookup, the default filename and the download.
    :param upload_type: Either 'automatic' or 'manual'.
    :param is_minute_averaged: Download minute averages (True) or raw data (False). Raw data is only
    available for upload_type='manual'.
    :param timeframe: [start, end] datetimes. If None, a range from 2016 to 2100 is used so that all
    data is included.
    :param overwrite_if_already_exists: If False and the output file exists, nothing is downloaded.
    :param subject_visit_number: If given, "([number])" is appended to the subject ID in the filename.
    :param suffix_filename: String inserted right before ".csv" in the default filename.
    :param filename: Output filename. If None, "[label]_respeck_[upload_type][_raw][suffix].csv".
    :param project_name: Key into project_mapping. Looked up from the subject ID when None.
    :param out_directory: Output directory. Defaults to project_mapping[project_name][2].
    :return: None
    '''
    assert upload_type in [
        'automatic', 'manual'
    ], "Upload type has to be either automatic or manual"

    # Compare by value: identity comparison ("is not") against a string literal is unreliable
    assert is_minute_averaged or upload_type != 'automatic', \
        "Only minute averaged data is automatically uploaded. Set is_minute_averaged=True or upload_type='manual'."

    if project_name is None:
        project_name = get_project_for_subject(subject_id)

    if out_directory is None:
        out_directory = project_mapping[project_name][2]

    if subject_visit_number is None:
        label_files = "{}".format(subject_id)
    else:
        label_files = "{}({})".format(subject_id, int(subject_visit_number))

    if filename is None:
        if is_minute_averaged:
            filename = "{}_respeck_{}{}.csv".format(label_files, upload_type,
                                                    suffix_filename)
        else:
            filename = "{}_respeck_{}_raw{}.csv".format(
                label_files, upload_type, suffix_filename)

    if timeframe is None:
        # Set to a timeframe which will definitely include all data
        timeframe = [datetime(2016, 1, 1), datetime(2100, 1, 1)]

    # One path for both the existence check and the download target, so that they agree
    # whether or not out_directory ends with a path separator
    out_filepath = os.path.join(out_directory, filename)

    if os.path.isfile(out_filepath) and not overwrite_if_already_exists:
        print("Data already downloaded")
        return

    if upload_type == 'automatic' or (upload_type == 'manual'
                                      and is_minute_averaged):
        download_respeck_minute_from_datastore(subject_id,
                                               out_filepath=out_filepath,
                                               timeframe=timeframe,
                                               project_name=project_name,
                                               upload_type=upload_type)
    elif upload_type == 'manual':
        download_raw_respeck_from_google_storage(
            subject_id,
            out_directory=out_directory,
            out_filename=filename,
            timeframe=timeframe,
            project_name=project_name,
            overwrite_file_if_existing=overwrite_if_already_exists,
            subject_visit_number=subject_visit_number)
    print('Done')

예제 #4

0

파일 보기

def _read_csv_skipping_bad_lines(path):
    '''
    Read a csv file with pandas, skipping malformed rows.
    pandas >= 1.3 spells this on_bad_lines='skip'; older releases only know error_bad_lines=False,
    so fall back to the old keyword when the new one is rejected.
    '''
    try:
        return pd.read_csv(path, on_bad_lines='skip')
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False)


def download_from_google_storage(subject_id,
                                 prefix_storage_filename,
                                 timestamp_label,
                                 out_filename,
                                 out_directory=None,
                                 project_name=None,
                                 timeframe=None,
                                 force_download=False,
                                 store_raw=False):
    '''
    Download all daily files of a subject from the project's storage bucket, decrypt them if needed,
    merge them and write the result to one csv file.
    :param subject_id: Subject ID. Blobs are listed under 'AirRespeck/[subject_id]'.
    :param prefix_storage_filename: Only blobs whose filename contains this string are used. For
    "Airspeck" and "GPSPhone" the data is minute averaged (unless store_raw); for "RESpeck" the columns
    'respeckTimestamp' and 'sequenceNumber' are dropped.
    :param timestamp_label: Name of the column holding timestamps in milliseconds.
    :param out_filename: Name of the csv file to write.
    :param out_directory: Output directory. Defaults to project_mapping[project_name][2]. Downloaded
    blobs are cached in its "temp" subdirectory.
    :param project_name: Key into project_mapping. Looked up from the subject ID when None.
    :param timeframe: [start, end] datetimes. Naive datetimes are localised to the project timezone.
    If None, a range from 2016 to 2100 is used.
    :param force_download: If False and the output file exists, nothing is downloaded.
    :param store_raw: If True, never minute average.
    :return: None. Nothing is written if no data was found.
    '''
    if project_name is None:
        project_name = get_project_for_subject(subject_id)

    if out_directory is None:
        out_directory = project_mapping[project_name][2]

    # One path for both the existence check and the final write, so that they agree
    # whether or not out_directory ends with a path separator
    out_filepath = os.path.join(out_directory, out_filename)

    if os.path.isfile(out_filepath) and not force_download:
        print("Data already downloaded")
        return

    # Did user pass timeframe? If not, load all data
    if timeframe is None:
        timeframe = [datetime(2016, 1, 1), datetime(2100, 1, 1)]

    # Select the timeframe, after accounting for timezone difference
    tz = timezone(project_mapping[project_name][1])

    if timeframe[0].tzinfo is None:
        localised_start = tz.localize(timeframe[0])
        localised_end = tz.localize(timeframe[1])
    else:
        localised_start = timeframe[0]
        localised_end = timeframe[1]

    # Files are per day, so compare file dates against the start of the first day
    start_of_first_day = localised_start.replace(hour=0, minute=0, second=0)
    temp_directory = os.path.join(out_directory, "temp")

    frames = []
    storage_client = storage.Client('specknet-pyramid-test')
    bucket = storage_client.get_bucket(project_mapping[project_name][0])

    for blob in bucket.list_blobs(prefix='AirRespeck/{}'.format(subject_id)):
        filename = blob.name.split("/")[-1]
        if subject_id not in filename or prefix_storage_filename not in filename:
            continue

        # The date is taken from the 10 characters before the 4-character file extension
        date_of_file = tz.localize(
            datetime.strptime(filename[-14:-4], "%Y-%m-%d"))
        # Skip file if it's not in the timeframe we're interested in!
        if date_of_file < start_of_first_day or date_of_file > localised_end:
            continue

        # Create temp directory if it doesn't exist yet
        os.makedirs(temp_directory, exist_ok=True)
        temp_file = os.path.join(temp_directory, filename)

        if not os.path.isfile(temp_file):
            blob.download_to_filename(temp_file)

        # Only peek at the first line and close the file again before decrypting,
        # so that the file is not rewritten in place while still open for reading
        with open(temp_file) as file:
            is_fully_encrypted = file.readline().strip() == "Encrypted"

        if is_fully_encrypted:
            # If data is encrypted, overwrite with decrypted version
            print("File is being decrypted")
            decrypt_file(temp_file, temp_file)
        else:
            # Try converting all dates. If this fails, some lines are probably encrypted
            temp_data = _read_csv_skipping_bad_lines(temp_file)
            try:
                pd.to_datetime(temp_data[timestamp_label],
                               unit='ms',
                               exact=False)
            except Exception:
                partly_decrypt_file(temp_file, temp_file)

        frames.append(_read_csv_skipping_bad_lines(temp_file))

    # Concatenate once at the end instead of growing a DataFrame inside the loop
    data = pd.concat(frames) if frames else pd.DataFrame()

    if len(data) > 0:
        data[timestamp_label] = pd.to_datetime(data[timestamp_label],
                                               unit='ms',
                                               exact=False)

        # Calculate minute averages
        if prefix_storage_filename in ["Airspeck", "GPSPhone"
                                       ] and not store_raw:
            data = data.groupby(data[timestamp_label].apply(
                lambda d: d.replace(second=0, microsecond=0))).mean()
        else:
            # Don't minute average here, but simply set the timestamp column as index
            data = data.set_index(data[timestamp_label]).sort_index()
            # Delete the original column
            data = data.drop(timestamp_label, axis=1)

        # Re-insert (copy) timestamp column from index, so that it is saved
        if 'timestamp' not in data.columns:
            data.insert(0, 'timestamp', data.index)

        # Remove NaTs from index
        data = data.loc[data.index.notnull()]

        # If we are downloading Respeck data, remove Respeck timestamps and sequence number
        if prefix_storage_filename == "RESpeck":
            data = data.drop(['respeckTimestamp', 'sequenceNumber'], axis=1)

        # The index holds naive timestamps, so slice with naive UTC bounds
        data = data[localised_start.astimezone(timezone('UTC')).replace(
            tzinfo=None):localised_end.astimezone(timezone('UTC')).replace(
                tzinfo=None)]

        data.to_csv(out_filepath, index=False)

예제 #5

0

파일 보기

def download_static_airspeck(subj_or_uuid,
                             sensor_label=None,
                             project_name=None,
                             overwrite_if_already_exists=False,
                             timeframe=None,
                             upload_type='automatic',
                             suffix_filename="",
                             filename=None,
                             subject_visit_number=None,
                             out_directory=None):
    '''
    Query static Airspeck readings from the datastore and write them to a csv file.
    :param subj_or_uuid: Subject ID (6 characters) or sensor UUID (16 characters).
    :param sensor_label: Filename prefix. Defaults to the ID, with "([visit number])" appended for a
    subject ID with a visit number.
    :param project_name: Key into project_mapping. Looked up automatically for 6-character IDs only.
    :param overwrite_if_already_exists: If False and the output file exists, nothing is downloaded.
    :param timeframe: [start, end] datetimes. Naive datetimes are interpreted in the project timezone
    and converted to naive UTC. If None, a range from 2016 to 2100 is used.
    :param upload_type: 'automatic' queries kind 'StaticAirspeck', 'sd_card' queries
    'StaticAirspeckSDCard'.
    :param suffix_filename: String inserted right before ".csv" in the default filename.
    :param filename: Output filename. If None, "[label]_static_airspeck_[upload_type][suffix].csv".
    :param subject_visit_number: See sensor_label.
    :param out_directory: Output directory. Defaults to project_mapping[project_name][2].
    :return: None
    :raises ValueError: If project_name is None and subj_or_uuid is not a 6-character subject ID.
    '''
    assert upload_type in [
        'automatic', 'sd_card'
    ], "upload_type has to be either 'automatic' or 'sd_card'"

    if project_name is None:
        if len(subj_or_uuid) != 6:
            raise ValueError(
                "When passing a UUID and not a subject ID, also specify a project_name so that the "
                "correct directory can be selected")
        project_name = get_project_for_subject(subj_or_uuid)

    if out_directory is None:
        out_directory = project_mapping[project_name][2]

    if sensor_label is None:
        label_needs_visit = (len(subj_or_uuid) == 6
                             and subject_visit_number is not None)
        sensor_label = ("{}({})".format(subj_or_uuid, subject_visit_number)
                        if label_needs_visit else subj_or_uuid)

    if filename is None:
        filename = "{}_static_airspeck_{}{}.csv".format(
            sensor_label, upload_type, suffix_filename)

    out_filepath = out_directory + filename

    if not overwrite_if_already_exists and os.path.isfile(out_filepath):
        print('Skipping file as it already exists')
        return

    client = get_datastore_client()

    # Keys shared by both upload types, in csv column order
    sensor_keys = ('temperature', 'humidity', 'battery', 'no2_ae', 'no2_we',
                   'ox_ae', 'ox_we')

    with open(out_filepath, "w") as out:

        out.write(
            "timestamp,pm1,pm2_5,pm10,bin0,bin1,bin2,bin3,bin4,bin5,bin6,bin7,bin8,bin9,bin10,bin11,bin12,"
            "bin13,bin14,bin15,temperature,humidity,battery,no2_ae,no2_we,ox_ae,ox_we,"
            "gpsLatitude,gpsLongitude\n")

        # No timeframe passed means: load all data
        if timeframe is None:
            timeframe = [datetime(2016, 1, 1), datetime(2100, 1, 1)]

        tz = timezone(project_mapping[project_name][1])

        def to_naive_utc(moment):
            # Interpret in project timezone, convert to UTC and strip the tzinfo
            return tz.localize(moment).astimezone(
                timezone('UTC')).replace(tzinfo=None)

        if timeframe[0].tzinfo is None:
            utc_start = to_naive_utc(timeframe[0])
            utc_end = to_naive_utc(timeframe[1])
        else:
            utc_start, utc_end = timeframe[0], timeframe[1]

        # Kind and the name of the UUID field depend on the upload type
        if upload_type == 'automatic':
            kind_name, uuid_field = 'StaticAirspeck', 'uuid'
        else:
            kind_name, uuid_field = 'StaticAirspeckSDCard', 'airspeck_uuid'
        id_name = uuid_field if len(subj_or_uuid) == 16 else 'subject_id'

        query = client.query(kind=kind_name,
                             filters=[(id_name, '=', subj_or_uuid),
                                      ('timestamp', '>=', utc_start),
                                      ('timestamp', '<', utc_end)],
                             order=['timestamp']).fetch()

        for e in query:
            out.write("{},{},{},{},".format(
                e['timestamp'].replace(tzinfo=None), e['pm1'], e['pm2_5'],
                e['pm10']))

            for bin_index in range(16):
                out.write("{},".format(e['bins'][bin_index]))

            sensor_values = [e[key] for key in sensor_keys]
            # Automatic uploads nest the coordinates under 'location'
            if upload_type == 'automatic':
                latitude = e['location']['latitude']
                longitude = e['location']['longitude']
            else:
                latitude = e['latitude']
                longitude = e['longitude']

            out.write("{},{},{},{},{},{},{},{},{}\n".format(
                *sensor_values, latitude, longitude))

    print('Done')

예제 #6

0

파일 보기

파일: load_files.py 프로젝트: azamkhan99/honours_project

def load_personal_airspeck_file(subject_id,
                                project_name=None,
                                upload_type='automatic',
                                is_minute_averaged=True,
                                subject_visit_number=None,
                                suffix_filename="",
                                calibrate_pm_and_gas=False,
                                use_all_features_for_pm_calibration=False,
                                use_all_features_for_gas_calibration=False,
                                suppress_output=False,
                                set_below_zero_to=np.nan,
                                return_calibration_flag=False,
                                calibration_id=None,
                                filter_pm=True,
                                country_name=None):
    '''
    Read a personal Airspeck csv file from the project directory via load_airrespeck_file, optionally
    calibrate it and filter invalid PM2.5 and humidity values.
    :param subject_id: Subject ID, used as the filename prefix and for the project lookup.
    :param project_name: Key into project_mapping. Looked up from the subject ID when None.
    :param upload_type: Part of the filename: "[label]_airspeck_personal_[upload_type]...".
    :param is_minute_averaged: If False, the "_raw" variant of the file is loaded.
    :param subject_visit_number: If given, "([number])" is appended to the subject ID in the filename.
    :param suffix_filename: String inserted right before ".csv".
    :param calibrate_pm_and_gas: If True, calibrate_airspeck is called. Note that it is called with
    calibrate_pm=True, calibrate_no2=False and calibrate_ox=False.
    :param use_all_features_for_pm_calibration: Passed to calibrate_airspeck as use_all_features_pm.
    :param use_all_features_for_gas_calibration: Passed to calibrate_airspeck as use_all_features_gas.
    :param suppress_output: If True, don't print how many PM2.5 values were replaced.
    :param set_below_zero_to: Replacement for PM2.5 values equal to or below zero (default np.nan).
    :param return_calibration_flag: If True and calibrate_pm_and_gas is True, return
    (result_date, was_calibrated_pm, data) instead of just data.
    :param calibration_id: Passed on to calibrate_airspeck.
    :param filter_pm: If True and data was loaded, replace PM2.5 values <= 0 and set humidity values
    above 105 to NaN.
    :param country_name: Passed on to calibrate_airspeck.
    :return: The loaded data, or the tuple described under return_calibration_flag.
    '''
    label_files = (subject_id if subject_visit_number is None else
                   "{}({:.0f})".format(subject_id, int(subject_visit_number)))

    if project_name is None:
        project_name = get_project_for_subject(subject_id)

    # Raw and minute-averaged files only differ in the "_raw" marker of the name
    name_template = ("{}_airspeck_personal_{}{}.csv" if is_minute_averaged
                     else "{}_airspeck_personal_{}_raw{}.csv")
    filepath = project_mapping[project_name][2] + name_template.format(
        label_files, upload_type, suffix_filename)

    print("Loading file: {}".format(filepath))
    data = load_airrespeck_file(filepath, project_name)

    if calibrate_pm_and_gas:
        result_date, was_calibrated_pm, was_calibrated_no2, was_calibrated_ox, data = calibrate_airspeck(
            subject_id,
            data,
            project_name=project_name,
            calibrate_pm=True,
            calibrate_no2=False,
            calibrate_ox=False,
            calibration_id=calibration_id,
            use_all_features_pm=use_all_features_for_pm_calibration,
            use_all_features_gas=use_all_features_for_gas_calibration,
            country_name=country_name)

    if filter_pm and data is not None and len(data) > 0:
        invalid_pm = data['pm2_5'] <= 0
        invalid_count = np.count_nonzero(invalid_pm)

        if invalid_count:
            if not suppress_output:
                print("Setting {} values equal to or below 0 to {}".format(
                    invalid_count, set_below_zero_to))
            data.loc[invalid_pm, 'pm2_5'] = set_below_zero_to

        # Humidity readings slightly above 100 can be valid; above 105 they are treated as invalid
        data.loc[data['humidity'] > 105, 'humidity'] = np.nan

    if calibrate_pm_and_gas and return_calibration_flag:
        return result_date, was_calibrated_pm, data
    return data

예제 #7

0

파일 보기