def test_interlock(self):
    """Interlock q4, three age bands and gender, then verify the new variable.

    Checks the value counts of the created ``q4AgeGen`` column and the value
    codes/labels, variable text and delimited-set flag reported for it.
    """
    dataset = self._get_dataset()
    # Reference taken before interlocking; the new column is read from it below.
    frame = dataset._data
    name, lab = 'q4AgeGen', 'q4 Age Gender'

    # Two range-based bands plus one open-ended band defined through is_ge.
    age_bands = [
        (code, span, {'age': frange(span)})
        for code, span in enumerate(['18-35', '30-49'], 1)
    ]
    age_bands.append((3, '50+', {'age': is_ge(50)}))
    dataset.interlock(name, lab, ['q4', {'age': age_bands}, 'gender'])

    # Expected value_counts() of the new column as (code string, count) pairs.
    expected_counts = [
        ('10;', 1367),
        ('8;', 1109),
        ('9;', 1036),
        ('7;', 831),
        ('3;', 736),
        ('8;10;', 579),
        ('1;', 571),
        ('4;', 550),
        ('2;', 454),
        ('7;9;', 438),
        ('1;3;', 340),
        ('2;4;', 244),
    ]
    code_strings = [code for code, _ in expected_counts]
    counts = [count for _, count in expected_counts]
    expected = pd.Series(counts, index=code_strings, name=name)
    self.assertTrue(all(expected == frame[name].value_counts()))

    expected_values = [
        (1, u'Yes/18-35/Male'),
        (2, u'Yes/18-35/Female'),
        (3, u'Yes/30-49/Male'),
        (4, u'Yes/30-49/Female'),
        (5, u'Yes/50+/Male'),
        (6, u'Yes/50+/Female'),
        (7, u'No/18-35/Male'),
        (8, u'No/18-35/Female'),
        (9, u'No/30-49/Male'),
        (10, u'No/30-49/Female'),
        (11, u'No/50+/Male'),
        (12, u'No/50+/Female'),
    ]
    self.assertEqual(expected_values, dataset.values(name))
    self.assertEqual(lab, dataset.text(name))
    self.assertTrue(dataset.is_delimited_set(name))
def test_derotate_df(self):
    """Derotate the dataset and verify the head, columns and length of the data.

    The first ten rows of the derotated frame are compared against fixed
    values (NaN cells are replaced by the string ``'nan'`` so that plain list
    equality works), then the column order and the total row count are
    checked. The .json/.csv files named after the derotated dataset are
    removed afterwards, even when an assertion fails.
    """
    dataset = self._get_dataset()
    levels = {'visit': ['visit_1', 'visit_2', 'visit_3']}
    mapper = [
        {'q14r{:02}'.format(r): ['q14r{0:02}c{1:02}'.format(r, c)
                                 for c in range(1, 4)]}
        for r in frange('1-5')
    ]
    ds = dataset.derotate(levels, mapper, 'gender', 'record_number')
    generated_files = [
        '{}/{}.json'.format(ds.path, ds.name),
        '{}/{}.csv'.format(ds.path, ds.name),
    ]
    try:
        df_h = ds._data.head(10)
        # NaN never equals itself, so substitute a comparable marker.
        df_val = [
            [x if not np.isnan(x) else 'nan' for x in line]
            for line in df_h.values.tolist()
        ]
        result_df = [
            [1.0, 2.0, 1.0, 4.0, 4.0, 4.0, 8.0, 1.0, 2.0, 4.0, 2.0, 3.0, 1.0],
            [1.0, 2.0, 2.0, 4.0, 4.0, 4.0, 8.0, 3.0, 3.0, 2.0, 4.0, 3.0, 1.0],
            [1.0, 3.0, 1.0, 1.0, 1.0, 8.0, 'nan', 4.0, 3.0, 1.0, 3.0, 1.0, 2.0],
            [1.0, 4.0, 1.0, 5.0, 5.0, 4.0, 8.0, 2.0, 3.0, 2.0, 3.0, 1.0, 1.0],
            [1.0, 4.0, 2.0, 4.0, 5.0, 4.0, 8.0, 2.0, 1.0, 3.0, 2.0, 1.0, 1.0],
            [1.0, 5.0, 1.0, 3.0, 3.0, 5.0, 8.0, 4.0, 2.0, 2.0, 1.0, 3.0, 1.0],
            [1.0, 5.0, 2.0, 5.0, 3.0, 5.0, 8.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.0],
            [1.0, 6.0, 1.0, 2.0, 2.0, 8.0, 'nan', 4.0, 2.0, 3.0, 4.0, 2.0, 1.0],
            [1.0, 7.0, 1.0, 3.0, 3.0, 3.0, 8.0, 2.0, 1.0, 3.0, 2.0, 4.0, 1.0],
            [1.0, 7.0, 2.0, 3.0, 3.0, 3.0, 8.0, 3.0, 2.0, 1.0, 2.0, 3.0, 1.0],
        ]
        result_columns = [
            '@1', 'record_number', 'visit', 'visit_levelled',
            'visit_1', 'visit_2', 'visit_3',
            'q14r01', 'q14r02', 'q14r03', 'q14r04', 'q14r05',
            'gender',
        ]
        df_len = 18520
        self.assertEqual(df_val, result_df)
        self.assertEqual(df_h.columns.tolist(), result_columns)
        self.assertEqual(len(ds._data.index), df_len)
    finally:
        # Clean up in `finally` so a failing assertion does not leave the
        # generated files behind for later test runs.
        for path in generated_files:
            os.remove(path)
def test_add_y_on_y(self):
    """Add two y-on-y definitions and verify the batch meta records both.

    Checks that the ``'back'`` entry of ``y_filter_map`` is None and that
    ``y_on_y`` lists the names in the order they were added.
    """
    batch, _ = _get_batch('test', full=True)
    # Fetched before the calls; the assertions below read from this object.
    meta = _get_meta(batch)

    definitions = [
        ('cross', {'age': frange('20-30')}, 'extend'),
        ('back', None, 'replace'),
    ]
    for y_name, y_filter, mode in definitions:
        batch.add_y_on_y(y_name, y_filter, mode)

    self.assertEqual(meta['y_filter_map']['back'], None)
    self.assertEqual(meta['y_on_y'], [y_name for y_name, _, _ in definitions])
def test_derotate_meta(self):
    """Derotate the dataset and verify that ``validate(False)`` returns None.

    The .json/.csv files named after the derotated dataset are removed
    afterwards, even when the assertion fails.
    """
    dataset = self._get_dataset()
    levels = {'visit': ['visit_1', 'visit_2', 'visit_3']}
    mapper = [
        {'q14r{:02}'.format(r): ['q14r{0:02}c{1:02}'.format(r, c)
                                 for c in range(1, 4)]}
        for r in frange('1-5')
    ]
    ds = dataset.derotate(levels, mapper, 'gender', 'record_number')
    generated_files = [
        '{}/{}.json'.format(ds.path, ds.name),
        '{}/{}.csv'.format(ds.path, ds.name),
    ]
    try:
        # Identity check: does not depend on how the returned object
        # implements `==` when something other than None comes back.
        self.assertIsNone(ds.validate(False))
    finally:
        # Clean up in `finally` so a failing assertion does not leave the
        # generated files behind for later test runs.
        for path in generated_files:
            os.remove(path)
def test_from_batch(self):
    """Build a dataset from batch ``'test1'`` and verify its content.

    Checks the remaining codes of ``q1``, the variable list, the gender
    value counts, the gender value texts for ``'en-GB'`` and ``'de-DE'``,
    and that requesting the ``'fr-FR'`` text key raises ValueError.
    """
    source = _get_dataset()
    source.force_texts('de-DE', 'en-GB')

    # Main batch: German texts, some q1 codes hidden, the rest sliced.
    main_batch, source = _get_batch('test1', source, full=True)
    main_batch.set_language('de-DE')
    main_batch.hiding('q1', frange('8,9,96-99'))
    main_batch.slicing('q1', frange('9-4'))

    # Second batch, attached to the main one as an addition.
    extra_batch, source = _get_batch('test2', source)
    extra_batch.add_downbreak('q1')
    extra_batch.add_crossbreak('Wave')
    extra_batch.as_addition('test1')

    subset = source.from_batch(
        'test1', 'RecordNo', 'de-DE', True, 'variables')

    self.assertEqual(subset.codes('q1'), [7, 6, 5, 4])
    expected_variables = [
        u'age', u'gender', u'q1', u'q2', u'q6', u'q8a', u'q9a',
        u'Wave', u'weight_a', u'RecordNo',
    ]
    self.assertEqual(subset.variables(), expected_variables)
    gender_counts = subset['gender'].value_counts().values.tolist()
    self.assertEqual(gender_counts, [3952])

    expected_texts = (
        ('en-GB', [None, None]),
        ('de-DE', [u'Male', u'Female']),
    )
    for language, labels in expected_texts:
        self.assertEqual(subset.value_texts('gender', language), labels)

    with self.assertRaises(ValueError):
        source.from_batch('test1', 'RecordNo', 'fr-FR')
def test_extend_filter(self):
    """Extend the batch filter per variable and verify names and mapping.

    Checks ``filter_names`` and ``x_filter_map`` of the batch meta after
    ``extend_filter`` has been called for ``q1`` and for ``('q2', 'q6')``.

    NOTE(review): another ``test_extend_filter`` definition is visible later
    in this source; if both live in the same class the later one replaces
    this one and this test never runs -- confirm.
    """
    batch, _ = _get_batch('test', full=True)
    # Fetched before the call; the assertions below read from this object.
    meta = _get_meta(batch)

    batch.extend_filter({
        'q1': {'age': frange('20-25')},
        ('q2', 'q6'): {'age': frange('30-35')},
    })

    expected_names = [
        'men_only', 'men_only_q1', 'men_only_q2', 'men_only_q6',
    ]
    self.assertEqual(meta['filter_names'], expected_names)

    expected_map = OrderedDict()
    expected_map['q1'] = 'men_only_q1'
    expected_map['q2'] = 'men_only_q2'
    for q6_variable in ('q6', 'q6_1', 'q6_2', 'q6_3'):
        expected_map[q6_variable] = 'men_only_q6'
    expected_map['age'] = 'men_only'
    self.assertEqual(meta['x_filter_map'], expected_map)
def show_items(self, array, text_key=None):
    """
    Display items of arrays in different DataSets.

    Parameters
    ----------
    array: str/ list of str
        Displays items for these variables. A single name is treated as a
        one-element list.
    text_key: str
        Text key for text-based label information. Can be provided as
        ``'x edits~tk'`` or ``'y edits~tk'``, then the edited text is taken.
        If None is provided, the item name will be displayed instead of
        the item label.

    Returns
    -------
    pandas.DataFrame or None
        One column per name in ``self.ds_alias.values()`` that contains the
        array, indexed by ``(array, item)`` tuples. If ``array`` is empty a
        message is printed and None is returned.

    Raises
    ------
    ValueError
        If ``self._is_array`` rejects one of the requested names.
    """
    # A bare string would otherwise be iterated character by character and
    # fail with a misleading "<char> is not an array." error.
    if isinstance(array, str):
        array = [array]

    if text_key:
        label = True
        parts = text_key.split('~')
        # First word after '~' is forwarded to ds.text() as its last argument.
        etk = parts[1].split()[0] if len(parts) > 1 else None
        text_key = parts[0]
    else:
        label = False
        etk = None

    df_all_v = []
    for a in array:
        if not self._is_array(a):
            raise ValueError('{} is not an array.'.format(a))
        all_df = []
        for name in list(self.ds_alias.values()):
            ds = self[name]
            if a not in ds:
                continue
            sources = ds.sources(a)
            if label:
                # Item texts as values, item names as index.
                val = [ds.text(s, True, text_key, etk) for s in sources]
                ind = sources
            else:
                # Item names as values, numbered via frange('1-n') as index.
                val = sources
                ind = frange('1-{}'.format(len(val)))
            index = pd.MultiIndex.from_tuples([(a, n) for n in ind])
            all_df.append(pd.DataFrame(val, index=index, columns=[name]))
        # NOTE(review): pd.concat raises on an empty list, i.e. when no
        # DataSet contains `a` -- confirm _is_array rules that case out.
        df_all_v.append(pd.concat(all_df, axis=1))

    if not df_all_v:
        print('No variables to show.')
        return None
    return pd.concat(df_all_v, axis=0)
def test_filter(self):
    """Filter the dataset in place and verify the alias and the remaining rows.

    Checks that ``dataset.filtered`` holds the alias, that the expected
    number of rows is left, and that only the filtered gender and age codes
    remain in the data.
    """
    dataset = self._get_dataset()
    f = intersection([{'gender': [2]}, {'age': frange('35-45')}])
    # NOTE(review): the alias says "men" while the filter selects gender
    # code 2; it is only a label here, but confirm the wording is intended.
    alias = 'men: 35 to 45 years old'
    dataset.filter(alias, f, inplace=True)

    # alias copied correctly?
    self.assertEqual(dataset.filtered, alias)

    # correctly sliced?
    expected_index_len = 1509
    self.assertEqual(len(dataset._data.index), expected_index_len)
    self.assertEqual(dataset['age'].value_counts().sum(), expected_index_len)

    expected_gender_codes = [2]
    expected_age_codes = [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]
    # assertEqual (rather than assertTrue(a == b)) reports both lists when
    # the comparison fails.
    self.assertEqual(
        dataset['gender'].value_counts().index.tolist(),
        expected_gender_codes)
    self.assertEqual(
        sorted(dataset['age'].value_counts().index.tolist()),
        expected_age_codes)
def test_derotate_freq(self):
    """Derotate the dataset and verify the value counts of every new column.

    For each variable the expected counts are turned into a Series (ordered
    by the given index) and compared element-wise with ``value_counts()`` of
    the derotated data. The .json/.csv files named after the derotated
    dataset are removed afterwards, even when an assertion fails.
    """
    dataset = self._get_dataset()
    levels = {'visit': ['visit_1', 'visit_2', 'visit_3']}
    mapper = [
        {'q14r{:02}'.format(r): ['q14r{0:02}c{1:02}'.format(r, c)
                                 for c in range(1, 4)]}
        for r in frange('1-5')
    ]
    ds = dataset.derotate(levels, mapper, 'gender', 'record_number')
    generated_files = [
        '{}/{}.json'.format(ds.path, ds.name),
        '{}/{}.csv'.format(ds.path, ds.name),
    ]
    try:
        val_c = {
            'visit': {
                'val': {1: 8255, 2: 6174, 3: 4091},
                'index': [1, 2, 3]},
            'visit_levelled': {
                'val': {4: 3164, 1: 3105, 5: 3094, 6: 3093, 3: 3082, 2: 2982},
                'index': [4, 1, 5, 6, 3, 2]},
            'visit_1': {
                'val': {4: 3225, 6: 3136, 3: 3081, 2: 3069, 1: 3029, 5: 2980},
                'index': [4, 6, 3, 2, 1, 5]},
            'visit_2': {
                'val': {1: 2789, 6: 2775, 5: 2765, 3: 2736, 4: 2709, 2: 2665,
                        8: 2081},
                'index': [1, 6, 5, 3, 4, 2, 8]},
            'visit_3': {
                'val': {8: 4166, 5: 2181, 4: 2112, 3: 2067, 1: 2040, 6: 2001,
                        2: 1872},
                'index': [8, 5, 4, 3, 1, 6, 2]},
            'q14r01': {
                'val': {3: 4683, 1: 4653, 4: 4638, 2: 4546},
                'index': [3, 1, 4, 2]},
            'q14r02': {
                'val': {4: 4749, 2: 4622, 1: 4598, 3: 4551},
                'index': [4, 2, 1, 3]},
            'q14r03': {
                'val': {1: 4778, 4: 4643, 3: 4571, 2: 4528},
                'index': [1, 4, 3, 2]},
            'q14r04': {
                'val': {1: 4665, 2: 4658, 4: 4635, 3: 4562},
                'index': [1, 2, 4, 3]},
            'q14r05': {
                'val': {2: 4670, 4: 4642, 1: 4607, 3: 4601},
                'index': [2, 4, 1, 3]},
            'gender': {
                'val': {2: 9637, 1: 8883},
                'index': [2, 1]},
        }
        for var, expected in val_c.items():
            series = pd.Series(expected['val'], index=expected['index'])
            compare = all(series == ds._data[var].value_counts())
            # Name the variable so a failure points at the offending column.
            self.assertTrue(
                compare, 'value counts differ for {}'.format(var))
    finally:
        # Clean up in `finally` so a failing assertion does not leave the
        # generated files behind for later test runs.
        for path in generated_files:
            os.remove(path)
def test_extend_filter(self):
    """Extend the batch filter per variable and verify names and logic.

    Checks ``filter_names`` and ``x_filter_map`` of the batch meta after
    ``extend_filter`` has been called for ``q1`` and for ``('q2', 'q6')``.
    Every extended entry is expected to map its combined filter name to the
    intersection of ``{'gender': 1}`` with the variable's age codes.
    """
    batch, _ = _get_batch('test', full=True)
    # Fetched before the call; the assertions below read from this object.
    meta = _get_meta(batch)

    batch.extend_filter({
        'q1': {'age': frange('20-25')},
        ('q2', 'q6'): {'age': frange('30-35')},
    })

    # (variable, combined filter name, age codes of the extension)
    extended = [
        ('q1', '(men only)+(q1)', [20, 21, 22, 23, 24, 25]),
        ('q2', '(men only)+(q2)', [30, 31, 32, 33, 34, 35]),
        ('q6', '(men only)+(q6)', [30, 31, 32, 33, 34, 35]),
        (u'q6_1', '(men only)+(q6_1)', [30, 31, 32, 33, 34, 35]),
        (u'q6_2', '(men only)+(q6_2)', [30, 31, 32, 33, 34, 35]),
        (u'q6_3', '(men only)+(q6_3)', [30, 31, 32, 33, 34, 35]),
    ]

    expected_names = ['men only']
    expected_names.extend(filter_name for _, filter_name, _ in extended)
    self.assertEqual(meta['filter_names'], expected_names)

    expected_map = OrderedDict()
    for variable, filter_name, ages in extended:
        logic = intersection([{'gender': 1}, {'age': ages}])
        expected_map[variable] = {filter_name: logic}
    # 'age' keeps the plain, unextended filter.
    expected_map['age'] = {'men only': {'gender': 1}}
    self.assertEqual(meta['x_filter_map'], expected_map)