예제 #1
0
    def merge_left(self,
                   left_on,
                   right_on,
                   right_fields=tuple(),
                   right_writers=None):
        """
        Perform a database-style left join of 'right_fields' onto the space
        defined by the left key.

        :param left_on: key field/array for the left side of the join
        :param right_on: key field/array for the right side of the join
        :param right_fields: fields/arrays from the right side to be mapped
            into the left space
        :param right_writers: optional writers, one per entry of
            'right_fields'; when provided, joined results are written to them
            instead of being returned
        :return: a list of joined arrays (empty when 'right_writers' is given)
        """
        l_key_raw = val.raw_array_from_parameter(self, 'left_on', left_on)
        l_index = np.arange(len(l_key_raw), dtype=np.int64)
        l_df = pd.DataFrame({'l_k': l_key_raw, 'l_index': l_index})

        r_key_raw = val.raw_array_from_parameter(self, 'right_on', right_on)
        r_index = np.arange(len(r_key_raw), dtype=np.int64)
        r_df = pd.DataFrame({'r_k': r_key_raw, 'r_index': r_index})

        df = pd.merge(left=l_df,
                      right=r_df,
                      left_on='l_k',
                      right_on='r_k',
                      how='left')
        # Left rows with no right-side match carry NaN in 'r_index', and
        # pandas raises when converting NaN straight to int64. Record the
        # unmatched rows in the filter first, then substitute 0 as a
        # placeholder map value for them (positions excluded by the filter
        # are presumably ignored by ops.safe_map — the placeholder value is
        # never meaningful).
        r_to_l_filt = np.logical_not(df['r_index'].isnull()).to_numpy()
        r_to_l_map = df['r_index'].fillna(0).to_numpy(dtype=np.int64)

        right_results = list()
        for irf, rf in enumerate(right_fields):
            rf_raw = val.raw_array_from_parameter(
                self, 'right_fields[{}]'.format(irf), rf)
            joined_field = ops.safe_map(rf_raw, r_to_l_map, r_to_l_filt)
            if right_writers is None:
                right_results.append(joined_field)
            else:
                right_writers[irf].data.write(joined_field)

        return right_results
예제 #2
0
def validate_temperature_v1(session, min_temp, max_temp, temps, temp_units,
                            temp_set, dest_temps, dest_temps_valid,
                            dest_temps_modified):
    """
    Check the temperature field and convert fahrenheit to celsius.

    :param session: The Exetera session instance.
    :param min_temp: The minimal numeric value for temperature.
    :param max_temp: The maximum numeric value for temperature.
    :param temps: The 'temperature' column from assessments dataframe.
    :param temp_units: The 'temperature_unit' column from assessments dataframe.
    :param temp_set: A field marking if the temperature field is set.
    :param dest_temps: A destination field to write the temperature values to.
    :param dest_temps_valid: A destination field indicating if the temperature is valid,
        i.e. between the minimum and maximum numeric values.
    :param dest_temps_modified: A destination field indicating if the temperature was
        modified here.
    """
    values = val.raw_array_from_parameter(session, "temps", temps)
    is_set = val.raw_array_from_parameter(session, "temp_set", temp_set)

    # Any reading above max_temp is treated as fahrenheit and converted to
    # celsius. NOTE(review): 'temp_units' is accepted but never consulted;
    # the conversion decision is purely value-based — confirm intended.
    fahrenheit = values > max_temp
    converted = np.where(fahrenheit, (values - 32) / 1.8, values)

    # valid only when the field was set and the (possibly converted) value
    # falls inside [min_temp, max_temp]
    in_range = (min_temp <= converted) & (converted <= max_temp)
    valid = is_set & in_range
    modified = values != converted

    dest_temps.data.write(converted)
    dest_temps_valid.data.write(valid)
    dest_temps_modified.data.write(modified)
예제 #3
0
    def merge_right(self,
                    left_on,
                    right_on,
                    left_fields=None,
                    left_writers=None):
        """
        Perform a database-style right join of 'left_fields' onto the space
        defined by the right key.

        :param left_on: key field/array for the left side of the join
        :param right_on: key field/array for the right side of the join
        :param left_fields: fields/arrays from the left side to be mapped
            into the right space; None is treated as no fields
        :param left_writers: optional writers, one per entry of
            'left_fields'; when provided, joined results are written to them
            instead of being returned
        :return: a list of joined arrays (empty when 'left_writers' is given)
        """
        # the original iterated 'left_fields' unconditionally, crashing when
        # the default of None was used; normalise to the empty tuple so the
        # method behaves like merge_left when no fields are requested
        if left_fields is None:
            left_fields = tuple()

        l_key_raw = val.raw_array_from_parameter(self, 'left_on', left_on)
        l_index = np.arange(len(l_key_raw), dtype=np.int64)
        l_df = pd.DataFrame({'l_k': l_key_raw, 'l_index': l_index})

        r_key_raw = val.raw_array_from_parameter(self, 'right_on', right_on)
        r_index = np.arange(len(r_key_raw), dtype=np.int64)
        r_df = pd.DataFrame({'r_k': r_key_raw, 'r_index': r_index})

        df = pd.merge(left=r_df,
                      right=l_df,
                      left_on='r_k',
                      right_on='l_k',
                      how='left')
        # Right rows with no left-side match carry NaN in 'l_index', and
        # pandas raises when converting NaN straight to int64. Record the
        # unmatched rows in the filter first, then substitute 0 as a
        # placeholder map value for them (positions excluded by the filter
        # are presumably ignored by ops.safe_map).
        l_to_r_filt = np.logical_not(df['l_index'].isnull()).to_numpy()
        l_to_r_map = df['l_index'].fillna(0).to_numpy(dtype='int64')

        left_results = list()
        for ilf, lf in enumerate(left_fields):
            lf_raw = val.raw_array_from_parameter(
                self, 'left_fields[{}]'.format(ilf), lf)
            joined_field = ops.safe_map(lf_raw, l_to_r_map, l_to_r_filt)
            if left_writers is None:
                left_results.append(joined_field)
            else:
                left_writers[ilf].data.write(joined_field)

        return left_results
예제 #4
0
    def dataset_sort_index(self, sort_indices, index=None):
        """
        Generate a sorted index based on a set of fields upon which to sort and an optional
        index to apply to the sort_indices
        :param sort_indices: a tuple or list of indices that determine the sorted order
        :param index: optional - the index by which the initial field should be permuted
        :return: the resulting index that can be used to permute unsorted fields
        """
        val._check_all_readers_valid_and_same_type(sort_indices)
        # sort keys are applied in reverse so the first entry of
        # 'sort_indices' becomes the most significant key (stable sorts)
        sort_keys = tuple(reversed(sort_indices))

        first_key = val.raw_array_from_parameter(self, 'readers', sort_keys[0])

        if index is None:
            acc_index = np.arange(len(first_key))
        else:
            acc_index = val.raw_array_from_parameter(self, 'index', index)

        # successively refine the accumulated permutation with each key
        for position, key in enumerate(sort_keys):
            if position == 0:
                key_data = first_key
            else:
                key_data = val.raw_array_from_parameter(self, 'readers', key)
            order = np.argsort(key_data[acc_index], kind='stable')
            acc_index = acc_index[order]

        return acc_index
예제 #5
0
    def get_index(self, target, foreign_key, destination=None):
        """
        Build an index mapping each entry of 'foreign_key' to the position of
        the matching value in 'target'.

        :param target: field/array of key values (e.g. patient ids); each
            value's position becomes its index
        :param foreign_key: field/array of key values to look up in 'target'
        :param destination: optional field or ndarray to receive the computed
            index; when omitted, the index array is returned instead
        :return: an int64 ndarray of indices when 'destination' is None,
            otherwise nothing is returned
        """
        print('  building patient_id index')
        t0 = time.time()
        # map each target value to its position; later duplicates overwrite
        # earlier ones
        target_lookup = dict()
        target_ = val.raw_array_from_parameter(self, "target", target)
        for i, v in enumerate(target_):
            target_lookup[v] = i
        print(f'  target lookup built in {time.time() - t0}s')

        print('  perform initial index')
        t0 = time.time()
        foreign_key_elems = val.raw_array_from_parameter(
            self, "foreign_key", foreign_key)
        # foreign_key_index = np.asarray([target_lookup.get(i, -1) for i in foreign_key_elems],
        #                                    dtype=np.int64)
        foreign_key_index = np.zeros(len(foreign_key_elems), dtype=np.int64)

        # keys absent from 'target' are assigned values at or above
        # operations.INVALID_INDEX so they can be recognised as invalid
        current_invalid = np.int64(operations.INVALID_INDEX)
        for i_k, k in enumerate(foreign_key_elems):
            index = target_lookup.get(k, current_invalid)
            # NOTE(review): this branch is also taken when a previously-seen
            # missing key is looked up again (its stored index is already
            # >= INVALID_INDEX), so 'current_invalid' is bumped once more and
            # gaps appear in the invalid numbering, though the key keeps its
            # original invalid index — confirm this is intended
            if index >= operations.INVALID_INDEX:
                current_invalid += 1
                target_lookup[k] = index
            foreign_key_index[i_k] = index
        print(f'  initial index performed in {time.time() - t0}s')

        if destination is not None:
            # fields expose a .data writer; plain ndarrays are assigned into
            if val.is_field_parameter(destination):
                destination.data.write(foreign_key_index)
            else:
                destination[:] = foreign_key_index
        else:
            return foreign_key_index
예제 #6
0
    def join(self,
             destination_pkey,
             fkey_indices,
             values_to_join,
             writer=None,
             fkey_index_spans=None):
        """
        Map 'values_to_join' (one value per run of identical foreign-key
        indices) onto the destination primary-key space.

        :param destination_pkey: the destination primary-key field/array;
            only its length is used to size the output
        :param fkey_indices: indices into the destination space, one per
            source row, sorted so equal values form runs
        :param values_to_join: one value per run of 'fkey_indices'
        :param writer: optional writer; when provided results are written to
            it rather than returned
        :param fkey_index_spans: optional precomputed spans over
            'fkey_indices'; computed via get_spans when omitted
        :return: the destination-space value array when 'writer' is None,
            otherwise nothing
        """
        if isinstance(destination_pkey, fld.IndexedStringField):
            raise ValueError(
                "'destination_pkey' must not be an indexed string field")
        if isinstance(fkey_indices, fld.IndexedStringField):
            raise ValueError(
                "'fkey_indices' must not be an indexed string field")
        if isinstance(values_to_join, rw.IndexedStringReader):
            raise ValueError(
                "Joins on indexed string fields are not supported")

        fkey_raw = val.raw_array_from_parameter(
            self, "fkey_indices", fkey_indices)

        values_raw = val.raw_array_from_parameter(
            self, "values_to_join", values_to_join)

        # spans over runs of identical foreign-key indices, generated here
        # when the caller has not supplied them
        if fkey_index_spans is None:
            fkey_index_spans = self.get_spans(field=fkey_raw)

        # one foreign key per span start: an ordered list of the unique
        # destination-space indices that the joined values map onto
        span_start_fkeys = fkey_raw[fkey_index_spans[:-1]]

        # drop foreign keys with no counterpart in the destination space
        # (anything at or above INVALID_INDEX)
        valid = span_start_fkeys < operations.INVALID_INDEX
        safe_fkeys = span_start_fkeys[valid]

        # the values are in the same space as the span-start keys, so apply
        # the same validity filter to them
        safe_values = values_raw[valid]

        # destination-sized output; rows with no contribution stay zero
        #destination_space_values = writer.chunk_factory(len(destination_pkey))
        mapped = np.zeros(len(destination_pkey), dtype=values_raw.dtype)

        # scatter the filtered values into their destination positions
        mapped[safe_fkeys] = safe_values

        if writer is not None:
            writer.data.write(mapped)
        else:
            return mapped
예제 #7
0
    def get_shared_index(self, keys):
        """
        Create a shared index based on a tuple of numpy arrays containing keys.
        This function generates the sorted union of a tuple of key fields and
        then maps the individual arrays to their corresponding indices in the
        sorted union.

        Example:
            key_1 = ['a', 'b', 'e', 'g', 'i']
            key_2 = ['b', 'b', 'c', 'c', 'e', 'g', 'j']
            key_3 = ['a', 'c', 'd', 'e', 'g', 'h', 'h', 'i']

            sorted_union = ['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j']

            key_1_index = [0, 1, 4, 5, 7]
            key_2_index = [1, 1, 2, 2, 4, 5, 8]
            key_3_index = [0, 2, 3, 4, 5, 6, 6, 7]

            :param keys: a tuple of groups, fields or ndarrays whose contents represent keys
        """
        if not isinstance(keys, tuple):
            raise ValueError("'keys' must be a tuple")

        raw_keys = list()
        combined = None
        for key in keys:
            raw = val.raw_array_from_parameter(self, 'keys', key)
            raw_keys.append(raw)
            # accumulate all key values; deduplication happens afterwards
            if combined is None:
                combined = pd.unique(raw)
            else:
                combined = np.concatenate((combined, raw), axis=0)
        union = np.sort(pd.unique(combined))

        # map each raw key array to its positions within the sorted union
        return tuple(np.searchsorted(union, raw) for raw in raw_keys)
예제 #8
0
    def _aggregate_impl(self, predicate, index, src=None, dest=None):
        """
        Shared implementation for aggregation operations: group by spans of
        identical values in 'index' and apply 'predicate' to each span.

        :param predicate: callable invoked as predicate(spans, src, dest_field)
        :param index: field/array whose equal-valued runs define the groups
        :param src: optional source field/array passed through to 'predicate'
        :param dest: optional destination field the predicate writes into
        :return: 'dest' when provided, otherwise the predicate's results
        """
        index_ = val.raw_array_from_parameter(self, "index", index)
        # NOTE(review): 'index_' is converted/validated above but get_spans
        # below is called with the original 'index' parameter, leaving
        # 'index_' unused beyond its validation side effect — confirm whether
        # get_spans should receive 'index_' instead

        dest_field = None
        if dest is not None:
            dest_field = val.field_from_parameter(self, "dest", dest)

        fkey_index_spans = self.get_spans(field=index)

        # execute the predicate (note that not every predicate requires a reader)
        results = predicate(fkey_index_spans, src, dest_field)

        return dest if dest is not None else results
예제 #9
0
    def get_shared_index(self, keys):
        """
        Create a shared index based on a tuple of fields/arrays containing
        keys: generate the sorted union of all key values, then map each
        input to its positions within that union.

        :param keys: a tuple of groups, fields or ndarrays whose contents
            represent keys
        :return: a tuple of index arrays, one per entry of 'keys'
        """
        if not isinstance(keys, tuple):
            raise ValueError("'keys' must be a tuple")
        concatted = None
        # keep the converted raw arrays: the final search must run on the
        # same raw values that built the union (the original searched the
        # 'keys' parameters directly, which is wrong for any entry that
        # raw_array_from_parameter had to convert)
        raw_keys = list()
        for k in keys:
            raw_field = val.raw_array_from_parameter(self, 'keys', k)
            raw_keys.append(raw_field)
            if concatted is None:
                concatted = pd.unique(raw_field)
            else:
                concatted = np.concatenate((concatted, raw_field), axis=0)
        concatted = pd.unique(concatted)
        concatted = np.sort(concatted)

        return tuple(np.searchsorted(concatted, k) for k in raw_keys)
예제 #10
0
    def distinct(self, field=None, fields=None, filter=None):
        """
        Get the distinct values of a single field, or the distinct combined
        rows of several fields.

        :param field: a single field/array; mutually exclusive with 'fields'
        :param fields: a sequence of equal-length arrays whose rows are
            treated as tuples; mutually exclusive with 'field'
        :param filter: currently unused; retained for interface compatibility
        :raises ValueError: if neither or both of 'field' and 'fields' are set
        :return: an ndarray of unique values for 'field', or a list of arrays
            (one per input field) holding the unique row combinations
        """
        # these were previously 'return ValueError(...)', which handed the
        # exception object back to the caller instead of raising it
        if field is None and fields is None:
            raise ValueError("One of 'field' and 'fields' must be set")
        if field is not None and fields is not None:
            raise ValueError("Only one of 'field' and 'fields' may be set")

        if field is not None:
            field_ = val.raw_array_from_parameter(self, 'field', field)
            return np.unique(field_)

        # view the input fields as a structured array so np.unique operates
        # on whole rows rather than on individual values
        entries = [(f'{i}', f.dtype) for i, f in enumerate(fields)]
        unified = np.empty_like(fields[0], dtype=np.dtype(entries))
        for i, f in enumerate(fields):
            unified[f'{i}'] = f

        uniques = np.unique(unified)
        results = [uniques[f'{i}'] for i in range(len(fields))]
        return results
예제 #11
0
 def index_spans(self, spans):
     """
     Expand a spans array into a per-element group index: element i of the
     result holds the index of the span that position i falls inside.

     :param spans: field/array of span boundaries; the final entry is the
         total element count
     :return: an int64 ndarray of length spans[-1] holding span indices
     """
     raw_spans = val.raw_array_from_parameter(self, "spans", spans)
     # output sized by the last span boundary (the total number of elements)
     results = np.zeros(raw_spans[-1], dtype=np.int64)
     # _index_spans is defined elsewhere in this module — presumably a
     # compiled (numba) helper that fills 'results' in place
     return _index_spans(raw_spans, results)
def test_type_from_mechanism_v1(datastore, mechanism, mechanism_free,
                                pcr_standard_answers, pcr_strong_inferred, pcr_weak_inferred,
                                antibody_standard_answers, antibody_strong_inferred, antibody_weak_inferred):
    """
    Classify the test mechanism by using data from the 'mechanism' field and user filled free text.

    :param datastore: The Exetera session instance.
    :param mechanism: The 'mechanism' column from the tests dataframe.
    :param mechanism_free: The 'mechanism_free' column from the tests dataframe.
    :param pcr_standard_answers: The field to indicate a standard pcr test performed.
    :param pcr_strong_inferred: The field to indicate a strong pcr test performed.
    :param pcr_weak_inferred: The field to indicate a weak pcr test performed.
    :param antibody_standard_answers: The field to indicate a standard antibody test performed.
    :param antibody_strong_inferred: The field to indicate a strong antibody test performed.
    :param antibody_weak_inferred: The field to indicate a weak antibody test performed.
    """

    def _match_any(text_entries, patterns):
        # flag each entry whose lower-cased text contains any of the patterns
        # (dtype=bool: the np.bool alias was removed in numpy 1.24)
        filt = np.zeros(len(text_entries), dtype=bool)
        for ie, e in enumerate(text_entries):
            lowered = e.lower()
            if any(p in lowered for p in patterns):
                filt[ie] = True
        return filt

    def _write(destination, values):
        # destinations may be plain ndarrays or ExeTera fields
        if isinstance(destination, np.ndarray):
            destination[:] = values
        else:
            destination.data.write(values)

    antibody_strong = ('blood', 'antib', 'anti b', 'anti-b', 'prick', 'antikro', 'anti kro', 'blod', 'all three', 'all tests', 'all of')
    antibody_weak = ('prick', 'stick i f', 'finger')
    pcr_strong = ('swab', 'swap', 'swob', 'swan', 'tonsil', 'nose', 'throat', 'näsa', 'svalg', 'oral', 'nasoph',
                  'saliva', 'all three', 'all tests', 'all of', 'plasma', 'drive t', 'drivet')
    pcr_weak = ('self test', 'self admin', 'home test', 'home admin', 'self', 'home', 'post', 'i did it', 'drive', 'hemma', 'private')

    r_mechanism = val.raw_array_from_parameter(datastore, 'mechanism', mechanism)

    # categorical answers: mechanism codes 1-4 are pcr, 5-7 are antibody
    _write(pcr_standard_answers, np.isin(r_mechanism, (1, 2, 3, 4)))
    _write(antibody_standard_answers, np.isin(r_mechanism, (5, 6, 7)))

    # free-text answers: infer the test type from substring matches
    r_mechanism_free = val.raw_array_from_parameter(datastore, 'mechanism_free', mechanism_free)

    _write(pcr_strong_inferred, _match_any(r_mechanism_free, pcr_strong))
    _write(pcr_weak_inferred, _match_any(r_mechanism_free, pcr_weak))
    _write(antibody_strong_inferred, _match_any(r_mechanism_free, antibody_strong))
    _write(antibody_weak_inferred, _match_any(r_mechanism_free, antibody_weak))
def test_type_from_mechanism_v1_standard_input(s, test_df):
    """
    Classify the test mechanism from a tests dataframe's 'mechanism' and
    'mechanism_freetext' columns, writing the inferred pcr/antibody flags
    into six boolean columns created on 'test_df'.

    :param s: The Exetera session instance.
    :param test_df: dataframe holding 'mechanism' and 'mechanism_freetext'
        columns; the six result columns are created on it here
    """
    mechanism = test_df['mechanism']
    mechanism_free = test_df['mechanism_freetext']
    pcr_standard_answers = test_df.create_numeric('pcr_standard_answers', 'bool').data
    pcr_strong_inferred = test_df.create_numeric('pcr_strong_inferred', 'bool').data
    pcr_weak_inferred = test_df.create_numeric('pcr_weak_inferred', 'bool').data
    antibody_standard_answers = test_df.create_numeric('antibody_standard_answers', 'bool').data
    antibody_strong_inferred = test_df.create_numeric('antibody_strong_inferred', 'bool').data
    antibody_weak_inferred = test_df.create_numeric('antibody_weak_inferred', 'bool').data

    def _match_any(text_entries, patterns):
        # flag each entry whose lower-cased text contains any of the patterns
        # (dtype=bool: the np.bool alias was removed in numpy 1.24)
        filt = np.zeros(len(text_entries), dtype=bool)
        for ie, e in enumerate(text_entries):
            lowered = e.lower()
            if any(p in lowered for p in patterns):
                filt[ie] = True
        return filt

    def _write(destination, values):
        # destinations here are field .data writers, but tolerate ndarrays too
        if isinstance(destination, np.ndarray):
            destination[:] = values
        else:
            destination.write(values)

    antibody_strong = ('blood', 'antib', 'anti b', 'anti-b', 'prick', 'antikro', 'anti kro', 'blod', 'all three', 'all tests', 'all of')
    antibody_weak = ('prick', 'stick i f', 'finger')
    pcr_strong = ('swab', 'swap', 'swob', 'swan', 'tonsil', 'nose', 'throat', 'näsa', 'svalg', 'oral', 'nasoph',
                  'saliva', 'all three', 'all tests', 'all of', 'plasma', 'drive t', 'drivet')
    pcr_weak = ('self test', 'self admin', 'home test', 'home admin', 'self', 'home', 'post', 'i did it', 'drive', 'hemma', 'private')

    r_mechanism = val.raw_array_from_parameter(s, 'mechanism', mechanism)

    # categorical answers: mechanism codes 1-4 are pcr, 5-7 are antibody
    _write(pcr_standard_answers, np.isin(r_mechanism, (1, 2, 3, 4)))
    _write(antibody_standard_answers, np.isin(r_mechanism, (5, 6, 7)))

    # todo problem w/ changing ds to s, return what for indexing string field
    r_mechanism_free = val.raw_array_from_parameter(s, 'mechanism_free', mechanism_free)

    # free-text answers: infer the test type from substring matches
    _write(pcr_strong_inferred, _match_any(r_mechanism_free, pcr_strong))
    _write(pcr_weak_inferred, _match_any(r_mechanism_free, pcr_weak))
    _write(antibody_strong_inferred, _match_any(r_mechanism_free, antibody_strong))
    _write(antibody_weak_inferred, _match_any(r_mechanism_free, antibody_weak))