def merge_left(self, left_on, right_on, right_fields=tuple(), right_writers=None):
    """
    Left-join fields from the right table onto the left key's row order.

    Builds a pandas left merge between the two keys, derives a map from each
    left row to its matching right row (plus a validity filter for unmatched
    rows), then maps every field in `right_fields` through that map.

    :param left_on: the left key; anything accepted by val.raw_array_from_parameter
    :param right_on: the right key; anything accepted by val.raw_array_from_parameter
    :param right_fields: fields from the right table to map into left-key row order
    :param right_writers: optional writers, one per right field; when provided,
        joined results are written to them instead of being accumulated
    :return: a list of joined arrays (empty when right_writers is not None)
    """
    l_key_raw = val.raw_array_from_parameter(self, 'left_on', left_on)
    l_index = np.arange(len(l_key_raw), dtype=np.int64)
    l_df = pd.DataFrame({'l_k': l_key_raw, 'l_index': l_index})
    r_key_raw = val.raw_array_from_parameter(self, 'right_on', right_on)
    r_index = np.arange(len(r_key_raw), dtype=np.int64)
    r_df = pd.DataFrame({'r_k': r_key_raw, 'r_index': r_index})
    # 'how=left' keeps every left row; unmatched rows get NaN in 'r_index'
    df = pd.merge(left=l_df, right=r_df, left_on='l_k', right_on='r_k', how='left')
    # NOTE(review): if any left key has no match, 'r_index' contains NaN and
    # to_numpy(dtype=np.int64) may fail or yield undefined values - confirm
    # callers guarantee full key coverage, or fillna before converting
    r_to_l_map = df['r_index'].to_numpy(dtype=np.int64)
    # validity filter: False where the left row had no right-side match
    r_to_l_filt = np.logical_not(df['r_index'].isnull()).to_numpy()
    right_results = list()
    for irf, rf in enumerate(right_fields):
        rf_raw = val.raw_array_from_parameter(
            self, 'right_fields[{}]'.format(irf), rf)
        # map the right field into left-key space, honouring the validity filter
        joined_field = ops.safe_map(rf_raw, r_to_l_map, r_to_l_filt)
        # joined_field = per._safe_map(rf_raw, r_to_l_map, r_to_l_filt)
        if right_writers is None:
            right_results.append(joined_field)
        else:
            right_writers[irf].data.write(joined_field)
    return right_results
def validate_temperature_v1(session, min_temp, max_temp, temps, temp_units, temp_set,
                            dest_temps, dest_temps_valid, dest_temps_modified):
    """
    Validate the temperature field, converting apparent fahrenheit readings to celsius.

    Any reading above `max_temp` is treated as fahrenheit and converted via
    (t - 32) / 1.8; readings at or below `max_temp` pass through unchanged.

    :param session: The Exetera session instance.
    :param min_temp: The minimal numeric value for temperature.
    :param max_temp: The maximum numeric value for temperature.
    :param temps: The 'temperature' column from assessments dataframe.
    :param temp_units: The 'temperature_unit' column from assessments dataframe.
    :param temp_set: A field marking if the temperature field is set.
    :param dest_temps: A destination field to write the temperature values to.
    :param dest_temps_valid: A destination field indicating whether the temperature
        is valid, i.e. set and between the minimum and maximum numeric values.
    :param dest_temps_modified: A destination field indicating whether the
        temperature was modified (converted) here.
    """
    temps_raw = val.raw_array_from_parameter(session, "temps", temps)
    set_raw = val.raw_array_from_parameter(session, "temp_set", temp_set)
    # values above max_temp are assumed to be fahrenheit; convert them to celsius
    converted = np.where(temps_raw > max_temp, (temps_raw - 32) / 1.8, temps_raw)
    in_range = (min_temp <= converted) & (converted <= max_temp)
    modified = temps_raw != converted
    dest_temps.data.write(converted)
    dest_temps_valid.data.write(set_raw & in_range)
    dest_temps_modified.data.write(modified)
def merge_right(self, left_on, right_on, left_fields=tuple(), left_writers=None):
    """
    Right-join fields from the left table onto the right key's row order.

    Builds a pandas merge with the right table on the 'left' side (which keeps
    every right row), derives a map from each right row to its matching left
    row plus a validity filter, then maps every field in `left_fields` through
    that map.

    :param left_on: the left key; anything accepted by val.raw_array_from_parameter
    :param right_on: the right key; anything accepted by val.raw_array_from_parameter
    :param left_fields: fields from the left table to map into right-key row order;
        defaults to an empty tuple (None is also accepted and treated as empty)
    :param left_writers: optional writers, one per left field; when provided,
        joined results are written to them instead of being accumulated
    :return: a list of joined arrays (empty when left_writers is not None)
    """
    # fix: the previous default of None raised TypeError at enumerate(); accept
    # None as "no fields" and default to an empty tuple, matching merge_left
    if left_fields is None:
        left_fields = tuple()
    l_key_raw = val.raw_array_from_parameter(self, 'left_on', left_on)
    l_index = np.arange(len(l_key_raw), dtype=np.int64)
    l_df = pd.DataFrame({'l_k': l_key_raw, 'l_index': l_index})
    r_key_raw = val.raw_array_from_parameter(self, 'right_on', right_on)
    r_index = np.arange(len(r_key_raw), dtype=np.int64)
    r_df = pd.DataFrame({'r_k': r_key_raw, 'r_index': r_index})
    # putting the right table on the 'left' side of a how='left' merge preserves
    # right-key row order; unmatched right rows get NaN in 'l_index'
    df = pd.merge(left=r_df, right=l_df, left_on='r_k', right_on='l_k', how='left')
    # NOTE(review): if any right key has no match, 'l_index' contains NaN and
    # to_numpy(dtype='int64') may fail or yield undefined values - confirm
    # callers guarantee full key coverage, or fillna before converting
    l_to_r_map = df['l_index'].to_numpy(dtype='int64')
    # validity filter: False where the right row had no left-side match
    l_to_r_filt = np.logical_not(df['l_index'].isnull()).to_numpy()
    left_results = list()
    for ilf, lf in enumerate(left_fields):
        lf_raw = val.raw_array_from_parameter(
            self, 'left_fields[{}]'.format(ilf), lf)
        joined_field = ops.safe_map(lf_raw, l_to_r_map, l_to_r_filt)
        if left_writers is None:
            left_results.append(joined_field)
        else:
            left_writers[ilf].data.write(joined_field)
    return left_results
def dataset_sort_index(self, sort_indices, index=None):
    """
    Generate a sorted index based on a set of fields upon which to sort and an
    optional index to apply to the sort_indices.

    The fields are applied via successive stable argsorts from the last entry
    to the first, so the first entry of `sort_indices` is the primary sort key.

    :param sort_indices: a tuple or list of indices that determine the sorted order
    :param index: optional - the index by which the initial field should be permuted
    :return: the resulting index that can be used to permute unsorted fields
    """
    val._check_all_readers_valid_and_same_type(sort_indices)
    reversed_readers = tuple(reversed(sort_indices))
    first_raw = val.raw_array_from_parameter(self, 'readers', reversed_readers[0])
    if index is None:
        acc = np.arange(len(first_raw))
    else:
        acc = val.raw_array_from_parameter(self, 'index', index)
    current_raw = first_raw
    for pos, reader in enumerate(reversed_readers):
        if pos:
            current_raw = val.raw_array_from_parameter(self, 'readers', reader)
        # stable sort preserves the ordering established by previous passes
        order = np.argsort(current_raw[acc], kind='stable')
        acc = acc[order]
    return acc
def get_index(self, target, foreign_key, destination=None):
    """
    Map each foreign key element to the index of its value in `target`.

    Elements with no match in `target` receive distinct sentinel indices
    starting at operations.INVALID_INDEX.

    :param target: the key space being indexed into; anything accepted by
        val.raw_array_from_parameter
    :param foreign_key: the values to look up in `target`
    :param destination: optional field or ndarray to receive the index; when
        None the index array is returned instead
    :return: the foreign key index array, only when destination is None
    """
    # NOTE(review): message mentions patient_id but this is generic - confirm
    print(' building patient_id index')
    t0 = time.time()
    target_lookup = dict()
    target_ = val.raw_array_from_parameter(self, "target", target)
    for i, v in enumerate(target_):
        target_lookup[v] = i
    print(f' target lookup built in {time.time() - t0}s')
    print(' perform initial index')
    t0 = time.time()
    foreign_key_elems = val.raw_array_from_parameter(
        self, "foreign_key", foreign_key)
    # foreign_key_index = np.asarray([target_lookup.get(i, -1) for i in foreign_key_elems],
    #                                dtype=np.int64)
    foreign_key_index = np.zeros(len(foreign_key_elems), dtype=np.int64)
    current_invalid = np.int64(operations.INVALID_INDEX)
    for i_k, k in enumerate(foreign_key_elems):
        # missing keys are allocated a fresh sentinel (>= INVALID_INDEX) and
        # remembered so repeats of the same missing key reuse the same sentinel
        index = target_lookup.get(k, current_invalid)
        if index >= operations.INVALID_INDEX:
            # NOTE(review): this branch also fires for repeats of an already
            # seen missing key, so current_invalid is bumped (skipping values)
            # without assigning a new index - confirm this is intended
            current_invalid += 1
            target_lookup[k] = index
        foreign_key_index[i_k] = index
    print(f' initial index performed in {time.time() - t0}s')
    if destination is not None:
        # fields are written through their data writer; ndarrays in place
        if val.is_field_parameter(destination):
            destination.data.write(foreign_key_index)
        else:
            destination[:] = foreign_key_index
    else:
        return foreign_key_index
def join(self, destination_pkey, fkey_indices, values_to_join, writer=None,
         fkey_index_spans=None):
    """
    Map values from foreign-key space into the destination primary-key space.

    Indexed string fields are rejected for all three inputs. Values aligned
    with the unique foreign-key spans are scattered into a zero-initialised
    array of length len(destination_pkey).

    :param destination_pkey: the destination primary key; its length sizes the output
    :param fkey_indices: foreign key indices (as produced by get_index)
    :param values_to_join: values aligned with the unique foreign-key spans
    :param writer: optional writer for the result; when None the array is returned
    :param fkey_index_spans: optional precomputed spans over fkey_indices
    :return: the destination-space array, only when writer is None
    """
    if isinstance(destination_pkey, fld.IndexedStringField):
        raise ValueError(
            "'destination_pkey' must not be an indexed string field")
    if isinstance(fkey_indices, fld.IndexedStringField):
        raise ValueError(
            "'fkey_indices' must not be an indexed string field")
    if isinstance(values_to_join, rw.IndexedStringReader):
        raise ValueError(
            "Joins on indexed string fields are not supported")
    raw_fkey_indices = val.raw_array_from_parameter(
        self, "fkey_indices", fkey_indices)
    raw_values_to_join = val.raw_array_from_parameter(
        self, "values_to_join", values_to_join)
    # generate spans for the sorted key indices if not provided
    if fkey_index_spans is None:
        fkey_index_spans = self.get_spans(field=raw_fkey_indices)
    # select the foreign keys from the start of each span to get an ordered list
    # of unique id indices in the destination space that the results of the predicate
    # execution are mapped to
    unique_fkey_indices = raw_fkey_indices[fkey_index_spans[:-1]]
    # generate a filter to remove invalid foreign key indices (where values in the
    # foreign key don't map to any values in the destination space
    invalid_filter = unique_fkey_indices < operations.INVALID_INDEX
    safe_unique_fkey_indices = unique_fkey_indices[invalid_filter]
    # the predicate results are in the same space as the unique_fkey_indices, which
    # means they may still contain invalid indices, so filter those now
    # NOTE(review): this assumes values_to_join is aligned one-to-one with the
    # unique span starts, not with the raw fkey rows - confirm against callers
    safe_values_to_join = raw_values_to_join[invalid_filter]
    # now get the memory that the results will be mapped to
    #destination_space_values = writer.chunk_factory(len(destination_pkey))
    destination_space_values = np.zeros(len(destination_pkey),
                                        dtype=raw_values_to_join.dtype)
    # finally, map the results from the source space to the destination space
    destination_space_values[safe_unique_fkey_indices] = safe_values_to_join
    if writer is not None:
        writer.data.write(destination_space_values)
    else:
        return destination_space_values
def get_shared_index(self, keys):
    """
    Create a shared index from a tuple of key collections.

    Computes the sorted union of all key values, then maps each input key
    array to its positions within that union.

    Example:
        key_1 = ['a', 'b', 'e', 'g', 'i']
        key_2 = ['b', 'b', 'c', 'c, 'e', 'g', 'j']
        key_3 = ['a', 'c' 'd', 'e', 'g', 'h', 'h', 'i']

        sorted_union = ['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j']

        key_1_index = [0, 1, 4, 5, 7]
        key_2_index = [1, 1, 2, 2, 4, 5, 8]
        key_3_index = [0, 2, 3, 4, 5, 6, 6, 7]

    :param keys: a tuple of groups, fields or ndarrays whose contents represent keys
    :return: a tuple of index arrays, one per input key
    """
    if not isinstance(keys, tuple):
        raise ValueError("'keys' must be a tuple")
    raw_keys = []
    union = None
    for key in keys:
        raw = val.raw_array_from_parameter(self, 'keys', key)
        raw_keys.append(raw)
        # fold each key into the running de-duplicated union
        if union is None:
            union = pd.unique(raw)
        else:
            union = np.concatenate((union, raw), axis=0)
            union = pd.unique(union)
    union = np.sort(union)
    # searchsorted over the sorted union yields each key's shared index
    return tuple(np.searchsorted(union, raw) for raw in raw_keys)
def _aggregate_impl(self, predicate, index, src=None, dest=None):
    """
    Run an aggregation predicate over the spans of `index`.

    :param predicate: callable taking (spans, src, dest_field); not every
        predicate requires a source
    :param index: the key whose spans delimit the aggregation groups
    :param src: optional source passed through to the predicate
    :param dest: optional destination field; when given it is returned,
        otherwise the predicate's result is returned
    """
    # raw value is unused below; call retained for its parameter
    # validation side effect - TODO confirm that is the intent
    index_ = val.raw_array_from_parameter(self, "index", index)
    writer = None
    if dest is not None:
        writer = val.field_from_parameter(self, "dest", dest)
    spans = self.get_spans(field=index)
    # execute the predicate (note that not every predicate requires a reader)
    results = predicate(spans, src, writer)
    return dest if dest is not None else results
def get_shared_index(self, keys):
    """
    Create a shared index from a tuple of key collections.

    Computes the sorted union of all key values, then maps each input key
    array to its positions within that union.

    :param keys: a tuple of groups, fields or ndarrays whose contents represent keys
    :return: a tuple of index arrays, one per input key
    """
    if not isinstance(keys, tuple):
        raise ValueError("'keys' must be a tuple")
    concatted = None
    # fix: keep the raw arrays so the final mapping runs over them; previously
    # searchsorted was applied to the original parameters, which is wrong for
    # any key that is a field/group rather than an ndarray
    raw_keys = list()
    for k in keys:
        raw_field = val.raw_array_from_parameter(self, 'keys', k)
        raw_keys.append(raw_field)
        if concatted is None:
            concatted = pd.unique(raw_field)
        else:
            concatted = np.concatenate((concatted, raw_field), axis=0)
            concatted = pd.unique(concatted)
    concatted = np.sort(concatted)
    return tuple(np.searchsorted(concatted, k) for k in raw_keys)
def distinct(self, field=None, fields=None, filter=None):
    """
    Return the distinct values of a single field, or the distinct value
    combinations across multiple fields.

    Exactly one of `field` and `fields` must be provided.

    :param field: a single field/array; its unique values are returned
    :param fields: a sequence of equal-length arrays; returns a list of arrays,
        one per input, holding the distinct row combinations
    :param filter: currently unused - NOTE(review): confirm whether filtering
        was intended here
    :raises ValueError: if neither or both of `field` and `fields` are set
    """
    # fix: these errors were previously *returned* rather than raised, so
    # callers silently received a ValueError instance instead of failing
    if field is None and fields is None:
        raise ValueError("One of 'field' and 'fields' must be set")
    if field is not None and fields is not None:
        raise ValueError("Only one of 'field' and 'fields' may be set")
    if field is not None:
        field_ = val.raw_array_from_parameter(self, 'field', field)
        return np.unique(field_)
    # pack the fields into a structured array so np.unique de-duplicates rows
    entries = [(f'{i}', f.dtype) for i, f in enumerate(fields)]
    unified = np.empty_like(fields[0], dtype=np.dtype(entries))
    for i, f in enumerate(fields):
        unified[f'{i}'] = f
    uniques = np.unique(unified)
    results = [uniques[f'{i}'] for i in range(len(fields))]
    return results
def index_spans(self, spans):
    """
    Expand span boundaries into a per-element index array.

    Allocates an int64 output of length spans[-1] and delegates the fill to
    _index_spans (presumably each position in [spans[i], spans[i+1]) receives
    i - confirm against _index_spans).

    :param spans: span boundaries; anything accepted by val.raw_array_from_parameter
    :return: the result of _index_spans over the raw spans
    """
    boundaries = val.raw_array_from_parameter(self, "spans", spans)
    out = np.zeros(boundaries[-1], dtype=np.int64)
    return _index_spans(boundaries, out)
def test_type_from_mechanism_v1(datastore, mechanism, mechanism_free, pcr_standard_answers,
                                pcr_strong_inferred, pcr_weak_inferred,
                                antibody_standard_answers, antibody_strong_inferred,
                                antibody_weak_inferred):
    """
    Classify the test mechanism by using data from the 'mechanism' field and user filled free text.

    :param datastore: The Exetera session instance.
    :param mechanism: The 'mechanism' column from the tests dataframe.
    :param mechanism_free: The 'mechanism_free' column from the tests dataframe.
    :param pcr_standard_answers: The field to indicate a standard pcr test performed.
    :param pcr_strong_inferred: The field to indicate a strong pcr test performed.
    :param pcr_weak_inferred: The field to indicate a weak pcr test performed.
    :param antibody_standard_answers: The field to indicate a standard antibody test performed.
    :param antibody_strong_inferred: The field to indicate a strong antibody test performed.
    :param antibody_weak_inferred: The field to indicate a weak antibody test performed.
    """

    def search_for_substring(text_entries, pattern):
        # True at each position whose entry contains 'pattern' (case-insensitive).
        # fix: np.bool is a removed alias (NumPy >= 1.24); use builtin bool.
        filt = np.zeros(len(text_entries), bool)
        for ie, e in enumerate(text_entries):
            if pattern in e.lower():
                filt[ie] = True
        return filt

    def match_any(text_entries, patterns):
        # OR together the substring filters for every pattern
        combined = np.zeros(len(text_entries), dtype=bool)
        for p in patterns:
            combined = combined | search_for_substring(text_entries, p)
        return combined

    def write_result(dest, values):
        # destinations may be plain ndarrays or fields exposing .data.write
        if isinstance(dest, np.ndarray):
            dest[:] = values
        else:
            dest.data.write(values)

    # NOTE(review): these two exclusion tuples are currently unused - kept for
    # reference; confirm whether exclusion logic was intended here
    antigen_exclusions = ('nose_throat_swab', 'throat_swab', 'nose_swab', 'spit_tube')
    antibody_exclusions = ('blood_sample', 'blood_sample_finger_prick',
                           'blood_sample_needle_draw')

    antibody_strong = ('blood', 'antib', 'anti b', 'anti-b', 'prick', 'antikro', 'anti kro',
                       'blod', 'all three', 'all tests', 'all of')
    antibody_weak = ('prick', 'stick i f', 'finger')
    pcr_strong = ('swab', 'swap', 'swob', 'swan', 'tonsil', 'nose', 'throat', 'näsa', 'svalg',
                  'oral', 'nasoph', 'saliva', 'all three', 'all tests', 'all of', 'plasma',
                  'drive t', 'drivet')
    pcr_weak = ('self test', 'self admin', 'home test', 'home admin', 'self', 'home', 'post',
                'i did it', 'drive', 'hemma', 'private')

    r_mechanism = val.raw_array_from_parameter(datastore, 'mechanism', mechanism)
    # categorical codes 1-4 denote a standard pcr test; 5-7 a standard antibody test
    write_result(pcr_standard_answers, np.isin(r_mechanism, (1, 2, 3, 4)))
    write_result(antibody_standard_answers, np.isin(r_mechanism, (5, 6, 7)))

    # infer test type from the free-text mechanism description; assumes entries
    # are str (bytes would fail the substring test) - TODO confirm
    r_mechanism_free = val.raw_array_from_parameter(datastore, 'mechanism_free',
                                                    mechanism_free)
    write_result(pcr_strong_inferred, match_any(r_mechanism_free, pcr_strong))
    write_result(pcr_weak_inferred, match_any(r_mechanism_free, pcr_weak))
    write_result(antibody_strong_inferred, match_any(r_mechanism_free, antibody_strong))
    write_result(antibody_weak_inferred, match_any(r_mechanism_free, antibody_weak))
def test_type_from_mechanism_v1_standard_input(s, test_df):
    """
    Classify the test mechanism from the 'mechanism' column and the free-text
    'mechanism_freetext' column of `test_df`, writing the results into newly
    created boolean numeric fields on `test_df`.

    :param s: The Exetera session instance.
    :param test_df: the tests dataframe; must contain 'mechanism' and
        'mechanism_freetext', and receives six new boolean result fields.
    """
    mechanism = test_df['mechanism']
    mechanism_free = test_df['mechanism_freetext']
    pcr_standard_answers = test_df.create_numeric('pcr_standard_answers', 'bool').data
    pcr_strong_inferred = test_df.create_numeric('pcr_strong_inferred', 'bool').data
    pcr_weak_inferred = test_df.create_numeric('pcr_weak_inferred', 'bool').data
    antibody_standard_answers = test_df.create_numeric('antibody_standard_answers', 'bool').data
    antibody_strong_inferred = test_df.create_numeric('antibody_strong_inferred', 'bool').data
    antibody_weak_inferred = test_df.create_numeric('antibody_weak_inferred', 'bool').data

    def search_for_substring(text_entries, pattern):
        # True at each position whose entry contains 'pattern' (case-insensitive).
        # fix: np.bool is a removed alias (NumPy >= 1.24); use builtin bool.
        filt = np.zeros(len(text_entries), bool)
        for ie, e in enumerate(text_entries):
            if pattern in e.lower():
                filt[ie] = True
        return filt

    def match_any(text_entries, patterns):
        # OR together the substring filters for every pattern
        combined = np.zeros(len(text_entries), dtype=bool)
        for p in patterns:
            combined = combined | search_for_substring(text_entries, p)
        return combined

    def write_result(dest, values):
        # destinations here are field data writers; ndarrays are assigned in place
        if isinstance(dest, np.ndarray):
            dest[:] = values
        else:
            dest.write(values)

    # NOTE(review): these two exclusion tuples are currently unused - kept for
    # reference; confirm whether exclusion logic was intended here
    antigen_exclusions = ('nose_throat_swab', 'throat_swab', 'nose_swab', 'spit_tube')
    antibody_exclusions = ('blood_sample', 'blood_sample_finger_prick',
                           'blood_sample_needle_draw')

    antibody_strong = ('blood', 'antib', 'anti b', 'anti-b', 'prick', 'antikro', 'anti kro',
                       'blod', 'all three', 'all tests', 'all of')
    antibody_weak = ('prick', 'stick i f', 'finger')
    pcr_strong = ('swab', 'swap', 'swob', 'swan', 'tonsil', 'nose', 'throat', 'näsa', 'svalg',
                  'oral', 'nasoph', 'saliva', 'all three', 'all tests', 'all of', 'plasma',
                  'drive t', 'drivet')
    pcr_weak = ('self test', 'self admin', 'home test', 'home admin', 'self', 'home', 'post',
                'i did it', 'drive', 'hemma', 'private')

    r_mechanism = val.raw_array_from_parameter(s, 'mechanism', mechanism)
    # categorical codes 1-4 denote a standard pcr test; 5-7 a standard antibody test
    write_result(pcr_standard_answers, np.isin(r_mechanism, (1, 2, 3, 4)))
    write_result(antibody_standard_answers, np.isin(r_mechanism, (5, 6, 7)))

    # TODO(original): problem w/ changing ds to s - unclear what to return for
    # an indexed string field; confirm raw_array_from_parameter handles it
    r_mechanism_free = val.raw_array_from_parameter(s, 'mechanism_free', mechanism_free)
    write_result(pcr_strong_inferred, match_any(r_mechanism_free, pcr_strong))
    write_result(pcr_weak_inferred, match_any(r_mechanism_free, pcr_weak))
    write_result(antibody_strong_inferred, match_any(r_mechanism_free, antibody_strong))
    write_result(antibody_weak_inferred, match_any(r_mechanism_free, antibody_weak))