def main():
    """Run the integration checks against the Crunch site at ``CRUNCH_URL``.

    Connects with ``CRUNCH_USER`` / ``CRUNCH_PASSWORD``, creates a dataset
    from ``DATASET_DOC``, appends ``ROWS`` to it and then checks, in order:

    1. exclusion filters (row counts left in the dataframe),
    2. a derived categorical variable built from rules,
    3. recodes (``combine_categories`` and ``combine_responses``).

    The dataset is deleted in a ``finally`` clause, so it is removed even
    when one of the checks fails.

    Raises:
        AssertionError: when any check fails.  Messages name the exclusion
            expression or group that failed.
    """
    assert not invalid_credentials()

    # Login.
    site = pycrunch.connect(CRUNCH_USER, CRUNCH_PASSWORD, CRUNCH_URL)
    assert isinstance(site, pycrunch.shoji.Catalog)

    # Create the test dataset.
    dataset = site.datasets.create(DATASET_DOC).refresh()
    assert isinstance(dataset, pycrunch.shoji.Entity)

    # Each entry is (exclusion expression, number of rows expected in the
    # dataframe once that expression is the dataset's exclusion filter).
    # Order is kept exactly as the checks were originally written.
    exclusion_cases = (
        # 1.1 Set a simple exclusion filter.
        ('identity > 5', 5),
        # 1.2 More complex exclusion filters involving a categorical
        #     variable.
        ('speak_spanish in [32766]', 10),
        ('speak_spanish in (32766, 32767)', 9),
        ('not (speak_spanish in (1, 2) and operating_system == "Linux")', 2),
        # 1.3 Exclusion filters with `has_any`.
        ('hobbies.has_any([32766])', 8),
        ('not hobbies.has_any([32766])', 4),
        ('hobbies.has_any([32766, 32767])', 7),
        ('music.has_any([32766])', 12),
        ('music.has_any([1])', 1),
        ('music.has_any([1, 2])', 0),
        # 1.4 Exclusion filters with `has_all`.
        ('hobbies.has_all([32767])', 11),
        ('not hobbies.has_all([32767])', 1),
        ('music.has_all([1])', 11),
        ('music.has_all([1]) or music.has_all([2])', 10),
        ('not ( music.has_all([1]) or music.has_all([2]) )', 2),
        # 1.5 Exclusion filters with `duplicates`.
        ('ip_address.duplicates()', 10),
        # 1.6 Exclusion filters with `valid` and `missing`.
        ('valid(speak_spanish)', 3),
        ('not valid(speak_spanish)', 9),
        ('missing(speak_spanish)', 9),
        ('missing(hobbies)', 11),
        ('not missing(hobbies)', 1),
        ('valid(hobbies)', 5),
        ('not valid(hobbies)', 7),
    )

    def assert_group(frame, mask, column, size, members, convert=None):
        """Check the rows of `frame` selected by `mask`.

        Asserts that exactly `size` rows are selected and that the set of
        their `column` values (passed through `convert` when given) equals
        `members`.
        """
        subset = frame[mask]
        assert len(subset) == size, (
            'expected %d rows in group, got %d' % (size, len(subset)))
        found = set()
        for _, rec in subset.iterrows():
            value = rec[column]
            found.add(value if convert is None else convert(value))
        assert found == members, (
            'column %r: expected %r, got %r' % (column, members, found))

    try:
        # Load initial data.
        pycrunch.importing.importer.append_rows(dataset, ROWS)

        # Check the initial number of rows.
        total_rows = len(ROWS) - 1  # excluding the header
        df = pandaslib.dataframe(dataset)
        assert len(df) == total_rows, (
            'expected %d rows after import, got %d' % (total_rows, len(df)))

        # 1. Exclusion Filter Integration Tests
        for expr, expected_rows in exclusion_cases:
            pycrunch.datasets.exclusion(dataset, expr)
            df = pandaslib.dataframe(dataset)
            # All cases share this line, so the message must identify the
            # expression that failed.
            assert len(df) == expected_rows, (
                'exclusion %r: expected %d rows, got %d'
                % (expr, expected_rows, len(df)))

        # 1.7 Clear the exclusion filter.
        pycrunch.datasets.exclusion(dataset)
        df = pandaslib.dataframe(dataset)
        assert len(df) == total_rows, (
            'expected %d rows after clearing the exclusion, got %d'
            % (total_rows, len(df)))

        # 2. Integration Tests for "Transformations".
        categories = [
            {'id': 1, 'name': 'Nerds', 'numeric_value': 1, 'missing': False},
            {'id': 2, 'name': 'Normal Users', 'numeric_value': 2,
             'missing': False},
            {'id': 3, 'name': 'Hipsters', 'numeric_value': 3,
             'missing': False},
            {'id': 32767, 'name': 'Unknown', 'numeric_value': None,
             'missing': True},
        ]
        rules = [
            'operating_system in '
            '("Linux", "Solaris", "Minix", "FreeBSD", "NetBSD")',
            'operating_system == "Windows"',
            'operating_system == "MacOS"',
            'missing(operating_system)',
        ]
        new_var = create_categorical(
            ds=dataset,
            categories=categories,
            rules=rules,
            name='Operating System Users',
            alias='operating_system_users',
            description='Type of Operating System Users'
        )
        assert isinstance(new_var, pycrunch.shoji.Entity)
        new_var.refresh()
        assert new_var.body.type == 'categorical'

        # Check the data on the new variable.
        df = pandaslib.dataframe(dataset)
        assert 'operating_system_users' in df

        # Check the nerds.
        assert_group(
            df, df['operating_system_users'] == 'Nerds', 'operating_system',
            8, {'Linux', 'Solaris', 'Minix', 'FreeBSD', 'NetBSD'})

        # Check the hipsters.
        assert_group(
            df, df['operating_system_users'] == 'Hipsters',
            'operating_system', 1, {'MacOS'})

        # Check normal users.
        assert_group(
            df, df['operating_system_users'] == 'Normal Users',
            'operating_system', 3, {'Windows'})

        # 3. Integration Tests for "Recodes".

        # 3.1 combine_categories.

        # On a 'categorical' variable.
        cat_map = {
            1: {
                'name': 'Bilingual',
                'missing': False,
                'combined_ids': [2, 3]
            },
            2: {
                'name': 'Not Bilingual',
                'missing': False,
                'combined_ids': [1, 4]
            },
            99: {
                'name': 'Unknown',
                'missing': True,
                'combined_ids': [32766, 32767]
            }
        }
        new_var = combine_categories(
            dataset, 'speak_spanish', cat_map, 'Bilingual Person', 'bilingual'
        )
        assert isinstance(new_var, pycrunch.shoji.Entity)
        new_var.refresh()
        assert new_var.body.type == 'categorical'

        df = pandaslib.dataframe(dataset)
        assert 'bilingual' in df

        # Check the data in the recoded variable.
        assert_group(
            df, df['bilingual'] == 'Bilingual', 'identity',
            5, {3, 4, 10, 11, 12}, convert=int)
        assert_group(
            df, df['bilingual'] == 'Not Bilingual', 'identity',
            4, {1, 2, 5, 6}, convert=int)
        # Rows recoded into the 'missing' category show up as nulls in the
        # dataframe, hence the isnull() mask.
        assert_group(
            df, df['bilingual'].isnull(), 'identity',
            3, {7, 8, 9}, convert=int)

        # On a 'categorical_array' variable.
        cat_map = {
            1: {
                'name': 'Interested',
                'missing': False,
                'combined_ids': [1, 2]
            },
            2: {
                'name': 'Not interested',
                'missing': False,
                'combined_ids': [3, 4]
            },
            99: {
                'name': 'Unknown',
                'missing': True,
                'combined_ids': [32766, 32767]
            }
        }
        new_var = combine_categories(
            dataset, 'hobbies', cat_map, 'Hobbies (recoded)', 'hobbies_recoded'
        )
        assert isinstance(new_var, pycrunch.shoji.Entity)
        new_var.refresh()
        assert new_var.body.type == 'categorical_array'

        df = pandaslib.dataframe(dataset)
        assert 'hobbies_recoded' in df

        # Check the data in the recoded variable.
        for _, row in df[['hobbies', 'hobbies_recoded']].iterrows():
            hobbies = row['hobbies']
            hobbies_rec = row['hobbies_recoded']
            assert len(hobbies) == len(hobbies_rec)

            for i, value in enumerate(hobbies):
                if value in ({'?': 32766}, {'?': 32767}):
                    assert hobbies_rec[i] == {'?': 99}
                elif value in (1, 2):
                    assert hobbies_rec[i] == 1
                elif value in (3, 4):
                    assert hobbies_rec[i] == 2

        # 3.2 combine_responses.
        response_map = {
            'music_recoded_1': ['music_1', 'music_2'],
            'music_recoded_2': ['music_97'],
            'music_recoded_3': ['music_98', 'music_99']
        }
        new_var = combine_responses(
            dataset, 'music', response_map, 'Music (alt)', 'music_recoded'
        )
        assert isinstance(new_var, pycrunch.shoji.Entity)
        new_var.refresh()
        assert new_var.body.type == 'multiple_response'

        df = pandaslib.dataframe(dataset)
        assert 'music_recoded' in df

        # TODO: Test the data in the recoded variable. Unsure of its meaning.

    finally:
        # Always remove the test dataset, even after a failed assertion.
        dataset.delete()
def main():
    """Run the integration checks through the ``Dataset`` wrapper.

    Connects to ``CRUNCH_URL``, creates a dataset from ``DATASET_DOC``,
    wraps it in ``Dataset`` and appends ``ROWS``.  It then checks exclusion
    filters, a derived single-response variable and recodes.  Expected
    results are derived from ``ROWS`` rather than hard-coded.  The dataset
    resource is deleted in a ``finally`` clause.

    Raises:
        AssertionError: when any check fails.
    """
    assert not invalid_credentials()

    # Login.
    site = pycrunch.connect(CRUNCH_USER, CRUNCH_PASSWORD, CRUNCH_URL)
    assert isinstance(site, pycrunch.shoji.Catalog)

    # Create the test dataset.
    dataset_resource = site.datasets.create(DATASET_DOC).refresh()
    assert isinstance(dataset_resource, pycrunch.shoji.Entity)
    dataset = Dataset(dataset_resource)

    # Literal shapes the checks compare against, named once.  The `raw_*`
    # lists match slices of ROWS; the `df_*` lists match what the dataframe
    # holds in the 'hobbies' / 'music' columns.
    raw_hobbies_32766 = [32766] * 4
    raw_hobbies_32767 = [32767] * 4
    df_hobbies_32766 = [{'?': 32766}] * 4
    df_hobbies_32767 = [{'?': 32767}] * 4
    music_all_1 = [1] * 5
    music_all_2 = [2] * 5

    def snapshot():
        """Return the current dataframe of the dataset."""
        return pandaslib.dataframe(dataset.resource)

    def expected_ids(keep):
        """Return the identities of the non-header ROWS accepted by `keep`."""
        return [raw[0] for raw in ROWS if raw[0] != 'identity' and keep(raw)]

    def identities(frame, mask):
        """Return the set of int identities of the rows selected by `mask`."""
        return {int(rec['identity']) for _, rec in frame[mask].iterrows()}

    def os_values(frame, label):
        """Return the operating systems of rows whose user type is `label`."""
        subset = frame[frame['operating_system_users'] == label]
        return {rec['operating_system'] for _, rec in subset.iterrows()}

    def verify_exclusion(expr, keep, row_ok=None):
        """Set `expr` as the exclusion filter and verify what remains.

        `keep` is a predicate over raw ROWS entries selecting the rows that
        should remain; `row_ok`, when given, is an extra predicate that must
        hold for every remaining dataframe row.
        """
        dataset.exclude(expr)
        frame = snapshot()
        allowed = expected_ids(keep)
        assert len(frame) == len(allowed)
        for _, rec in frame.iterrows():
            assert rec['identity'] in allowed
            if row_ok is not None:
                assert row_ok(rec)

    try:
        # Load initial data.
        pycrunch.importing.importer.append_rows(dataset.resource, ROWS)

        # Check the initial number of rows.
        frame = snapshot()
        assert len(frame) == len(ROWS) - 1  # excluding the header

        # 1. Exclusion Filter Integration Tests

        # 1.1 Set a simple exclusion filter.
        dataset.exclude('identity > 5')
        frame = snapshot()
        assert len(frame) == 5
        assert all(not (rec['identity'] > 5) for _, rec in frame.iterrows())

        # 1.2 More complex exclusion filters involving a categorical
        #     variable.
        verify_exclusion(
            'speak_spanish in [32766]',
            lambda raw: raw[4] != 32766)

        verify_exclusion(
            'speak_spanish in (32766, 32767)',
            lambda raw: raw[4] not in (32766, 32767),
            lambda rec: not isnan(rec['speak_spanish']))

        verify_exclusion(
            'not (speak_spanish in (1, 2) and operating_system == "Linux")',
            lambda raw: raw[4] in (1, 2) and raw[2] == 'Linux',
            lambda rec: (
                rec['speak_spanish'] in (
                    'I speak Spanish primarily',
                    'I speak both Spanish and English equally')
                and rec['operating_system'] == 'Linux'))

        # 1.3 Exclusion filters with `any`.
        verify_exclusion(
            'hobbies.any([32766])',
            lambda raw: 32766 not in raw[5:9],
            lambda rec: {'?': 32766} not in rec['hobbies'])

        verify_exclusion(
            'not hobbies.any([32766])',
            lambda raw: 32766 in raw[5:9],
            lambda rec: {'?': 32766} in rec['hobbies'])

        verify_exclusion(
            'hobbies.any([32766, 32767])',
            lambda raw: 32766 not in raw[5:9] and 32767 not in raw[5:9],
            lambda rec: ({'?': 32766} not in rec['hobbies']
                         and {'?': 32767} not in rec['hobbies']))

        verify_exclusion(
            'music.any([32766])',
            lambda raw: 32766 not in raw[9:14],
            lambda rec: {'?': 32766} not in rec['music'])

        verify_exclusion(
            'music.any([1])',
            lambda raw: 1 not in raw[9:14],
            lambda rec: 1 not in rec['music'])

        verify_exclusion(
            'music.any([1, 2])',
            lambda raw: 1 not in raw[9:14] and 2 not in raw[9:14],
            lambda rec: 1 not in rec['music'] and 2 not in rec['music'])

        # 1.4 Exclusion filters with `all`.
        verify_exclusion(
            'hobbies.all([32767])',
            lambda raw: raw[5:9] != raw_hobbies_32767,
            lambda rec: rec['hobbies'] != df_hobbies_32767)

        verify_exclusion(
            'not hobbies.all([32767])',
            lambda raw: raw[5:9] == raw_hobbies_32767,
            lambda rec: rec['hobbies'] == df_hobbies_32767)

        verify_exclusion(
            'music.all([1])',
            lambda raw: raw[9:14] != music_all_1,
            lambda rec: rec['music'] != music_all_1)

        verify_exclusion(
            'music.all([1]) or music.all([2])',
            lambda raw: (raw[9:14] != music_all_1
                         and raw[9:14] != music_all_2),
            lambda rec: (rec['music'] != music_all_1
                         and rec['music'] != music_all_2))

        verify_exclusion(
            'not ( music.all([1]) or music.all([2]) )',
            lambda raw: (raw[9:14] == music_all_1
                         or raw[9:14] == music_all_2),
            lambda rec: (rec['music'] == music_all_1
                         or rec['music'] == music_all_2))

        # 1.5 Exclusion filters with `duplicates`.
        dataset.exclude('ip_address.duplicates()')
        frame = snapshot()
        seen_ip_addresses = []
        for _, rec in frame.iterrows():
            # No address may appear twice among the remaining rows.
            assert rec['ip_address'] not in seen_ip_addresses
            seen_ip_addresses.append(rec['ip_address'])

        # 1.6 Exclusion filters with `valid` and `missing`.
        verify_exclusion(
            'valid(speak_spanish)',
            lambda raw: raw[4] in (32766, 32767),
            lambda rec: isnan(rec['speak_spanish']))

        verify_exclusion(
            'not valid(speak_spanish)',
            lambda raw: raw[4] not in (32766, 32767),
            lambda rec: not isnan(rec['speak_spanish']))

        verify_exclusion(
            'missing(speak_spanish)',
            lambda raw: raw[4] not in (32766, 32767),
            lambda rec: not isnan(rec['speak_spanish']))

        verify_exclusion(
            'missing(hobbies)',
            lambda raw: (raw[5:9] != raw_hobbies_32766
                         and raw[5:9] != raw_hobbies_32767),
            lambda rec: (rec['hobbies'] != df_hobbies_32766
                         and rec['hobbies'] != df_hobbies_32767))

        verify_exclusion(
            'not missing(hobbies)',
            lambda raw: (raw[5:9] == raw_hobbies_32766
                         or raw[5:9] == raw_hobbies_32767),
            lambda rec: (rec['hobbies'] == df_hobbies_32766
                         or rec['hobbies'] == df_hobbies_32767))

        verify_exclusion(
            'valid(hobbies)',
            lambda raw: 32766 in raw[5:9] or 32767 in raw[5:9],
            lambda rec: ({'?': 32766} in rec['hobbies']
                         or {'?': 32767} in rec['hobbies']))

        verify_exclusion(
            'not valid(hobbies)',
            lambda raw: 32766 not in raw[5:9] and 32767 not in raw[5:9],
            lambda rec: ({'?': 32766} not in rec['hobbies']
                         and {'?': 32767} not in rec['hobbies']))

        # 1.7 Exclusion filter that refers to a subvariable by alias.
        verify_exclusion(
            'hobbies_1 == 4',
            lambda raw: raw[5] != 4,
            lambda rec: rec['hobbies'][0] != 4)

        # 1.8 Complex exclusion filters (multiple rules)
        # Only checks that the expression is accepted; no rows are verified.
        dataset.exclude(
            '(religion != 1 and (not valid(speak_spanish) or speak_spanish >= 1)) '
            'or (religion == 1 and speak_spanish == 2) '
            'or (religion == 3 and speak_spanish == 4)')

        # 1.9 Exclusion filters using date variables.
        # Each entry: (cutoff timestamp, expression template, comparison a
        # remaining row's registration time must satisfy vs. the cutoff).
        date_cases = (
            ('2014-12-30T00:00:00+00:00', 'registration_time < "%s"',
             lambda when, cutoff: when >= cutoff),
            ('2015-01-01T00:00:00+00:00', 'registration_time >= "%s"',
             lambda when, cutoff: when < cutoff),
            ('2014-05-10T00:00:00+00:00', 'registration_time == "%s"',
             lambda when, cutoff: when != cutoff),
            ('2014-05-10T00:00:00+00:00', 'not(registration_time == "%s")',
             lambda when, cutoff: when == cutoff),
        )
        for dt_str, template, remains in date_cases:
            dt = isodate.parse_datetime(dt_str)
            # The lambda is consumed inside this call, so capturing the loop
            # variables is safe.
            verify_exclusion(
                template % dt_str,
                lambda raw: remains(isodate.parse_datetime(raw[3]), dt))

        # 1.10 Clear the exclusion filter.
        dataset.exclude()
        frame = snapshot()
        assert len(frame) == len(ROWS) - 1  # excluding the header

        # 2. Integration Tests for "Transformations".
        categories = [
            {
                'id': 1,
                'name': 'Nerds',
                'numeric_value': 1,
                'missing': False,
                'case': 'operating_system in ("Linux", "Solaris", "Minix", "FreeBSD", "NetBSD")',
            },
            {
                'id': 2,
                'name': 'Normal Users',
                'numeric_value': 2,
                'missing': False,
                'case': 'operating_system == "Windows"',
            },
            {
                'id': 3,
                'name': 'Hipsters',
                'numeric_value': 3,
                'missing': False,
                'case': 'operating_system == "MacOS"',
            },
            {
                'id': 32767,
                'name': 'Unknown',
                'numeric_value': None,
                'missing': True,
                'case': 'missing(operating_system)',
            },
        ]
        new_var = dataset.create_single_response(
            categories=categories,
            name='Operating System Users',
            alias='operating_system_users',
            description='Type of Operating System Users')
        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical'

        # Check the data on the new variable.
        frame = snapshot()
        assert 'operating_system_users' in frame

        # Check the nerds.
        assert os_values(frame, 'Nerds') == {
            'Linux', 'Solaris', 'Minix', 'FreeBSD', 'NetBSD'
        }

        # Check the hipsters.
        assert os_values(frame, 'Hipsters') == {'MacOS'}

        # Check normal users.
        assert os_values(frame, 'Normal Users') == {'Windows'}

        # 3. Integration Tests for "Recodes".

        # 3.1 combine_categories.

        # On a 'categorical' variable.
        cat_map = {1: [2, 3], 2: [1, 4], 99: [32766, 32767]}
        cat_names = {1: 'Bilingual', 2: 'Not Bilingual', 99: 'Unknown'}
        new_var = dataset.combine_categorical(
            'speak_spanish',
            map=cat_map,
            categories=cat_names,
            name='Bilingual Person',
            alias='bilingual',
            missing=[99])
        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical'

        frame = snapshot()
        assert 'bilingual' in frame

        # Check the data in the recoded variable.
        bilingual_ids = set(expected_ids(lambda raw: raw[4] in (2, 3)))
        assert identities(
            frame, frame['bilingual'] == 'Bilingual') == bilingual_ids

        non_bilingual_ids = set(expected_ids(lambda raw: raw[4] in (1, 4)))
        assert identities(
            frame, frame['bilingual'] == 'Not Bilingual') == non_bilingual_ids

        bilingual_null_ids = set(
            expected_ids(lambda raw: raw[4] in (32766, 32767)))
        assert identities(
            frame, frame['bilingual'].isnull()) == bilingual_null_ids

        # On a 'categorical_array' variable.
        cat_map = {1: [1, 2], 2: [3, 4], 99: [32766, 32767]}
        cat_names = {
            1: 'Interested',
            2: 'Not interested',
            99: 'Unknown',
        }
        new_var = dataset.combine_categorical(
            'hobbies',
            map=cat_map,
            categories=cat_names,
            name='Hobbies (recoded)',
            alias='hobbies_recoded',
            missing=[99])
        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical_array'

        frame = snapshot()
        assert 'hobbies_recoded' in frame

        # Check the data in the recoded variable.
        for _, rec in frame.iterrows():
            hobbies = rec['hobbies']
            hobbies_rec = rec['hobbies_recoded']
            assert len(hobbies) == len(hobbies_rec)

            # Lengths were just checked equal, so pairing element-wise
            # visits every subvariable exactly once.
            for value, recoded in zip(hobbies, hobbies_rec):
                if value in ({'?': 32766}, {'?': 32767}):
                    assert recoded == {'?': 99}
                elif value in (1, 2):
                    assert recoded == 1
                elif value in (3, 4):
                    assert recoded == 2

        # 3.2 combine_responses.
        cat_map = {1: [1, 2], 2: [97], 3: [98, 99]}
        cat_names = {
            1: 'music_recoded_1',
            2: 'music_recoded_2',
            3: 'music_recoded_3'
        }
        new_var = dataset.combine_multiple_response(
            'music',
            map=cat_map,
            categories=cat_names,
            name='Music (alt)',
            alias='music_recoded')
        assert isinstance(new_var, Variable)
        assert new_var.type == 'multiple_response'

        frame = snapshot()
        assert 'music_recoded' in frame

        # TODO: Test the data in the recoded variable. Unsure of its meaning.

    finally:
        # Always remove the test dataset, even after a failed assertion.
        dataset.resource.delete()
def main(): assert not invalid_credentials() # Login. site = pycrunch.connect(CRUNCH_USER, CRUNCH_PASSWORD, CRUNCH_URL) assert isinstance(site, pycrunch.shoji.Catalog) # Create the test dataset. dataset = site.datasets.create(DATASET_DOC).refresh() assert isinstance(dataset, pycrunch.shoji.Entity) try: # Load initial data. pycrunch.importing.importer.append_rows(dataset, ROWS) # Check the initial number of rows. df = pandaslib.dataframe(dataset) assert len(df) == len(ROWS) - 1 # excluding the header # 1. Exclusion Filter Integration Tests # 1.1 Set a simple exclusion filter. dataset.exclude('identity > 5') df = pandaslib.dataframe(dataset) assert len(df) == 5 assert not any(r['identity'] > 5 for _, r in df.iterrows()) # 1.2 More complex exclusion filters involving a categorical variable. expr = 'speak_spanish in [32766]' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[4] != 32766 ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids expr = 'speak_spanish in (32766, 32767)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[4] not in (32766, 32767) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert not isnan(row['speak_spanish']) expr = 'not (speak_spanish in (1, 2) and operating_system == "Linux")' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[4] in (1, 2) and row[2] == 'Linux' ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['speak_spanish'] in \ ('I speak Spanish primarily', 'I speak both Spanish and English equally') assert row['operating_system'] == 'Linux' # 1.3 Exclusion filters with `has_any`. 
expr = 'hobbies.has_any([32766])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and 32766 not in row[5:9] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert {'?': 32766} not in row['hobbies'] expr = 'not hobbies.has_any([32766])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and 32766 in row[5:9] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert {'?': 32766} in row['hobbies'] expr = 'hobbies.has_any([32766, 32767])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and 32766 not in row[5:9] and 32767 not in row[5:9] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert {'?': 32766} not in row['hobbies'] and \ {'?': 32767} not in row['hobbies'] expr = 'music.has_any([32766])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and 32766 not in row[9:14] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert {'?': 32766} not in row['music'] expr = 'music.has_any([1])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and 1 not in row[9:14] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert 1 not in row['music'] expr = 'music.has_any([1, 2])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and 1 not in row[9:14] and 2 not in row[9:14] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert 1 not in row['music'] and 2 not in row['music'] # 1.4 
Exclusion filters with `has_all`. expr = 'hobbies.has_all([32767])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[5:9] != [32767, 32767, 32767, 32767] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['hobbies'] != [{'?': 32767}, {'?': 32767}, {'?': 32767}, {'?': 32767}] expr = 'not hobbies.has_all([32767])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[5:9] == [32767, 32767, 32767, 32767] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['hobbies'] == [{'?': 32767}, {'?': 32767}, {'?': 32767}, {'?': 32767}] expr = 'music.has_all([1])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[9:14] != [1, 1, 1, 1, 1] ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['music'] != [1, 1, 1, 1, 1] expr = 'music.has_all([1]) or music.has_all([2])' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and (row[9:14] != [1, 1, 1, 1, 1] and row[9:14] != [2, 2, 2, 2, 2]) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['music'] != [1, 1, 1, 1, 1] and \ row['music'] != [2, 2, 2, 2, 2] expr = 'not ( music.has_all([1]) or music.has_all([2]) )' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and (row[9:14] == [1, 1, 1, 1, 1] or row[9:14] == [2, 2, 2, 2, 2]) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['music'] == [1, 1, 1, 1, 1] or \ row['music'] == [2, 2, 2, 2, 2] # 1.5 Exclusion filters with `duplicates`. 
expr = 'ip_address.duplicates()' dataset.exclude(expr) df = pandaslib.dataframe(dataset) seen_ip_addresses = [] for _, row in df.iterrows(): assert row['ip_address'] not in seen_ip_addresses seen_ip_addresses.append(row['ip_address']) # 1.6 Exclusion filters with `valid` and `missing`. expr = 'valid(speak_spanish)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[4] in (32766, 32767) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert isnan(row['speak_spanish']) expr = 'not valid(speak_spanish)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[4] not in (32766, 32767) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert not isnan(row['speak_spanish']) expr = 'missing(speak_spanish)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[4] not in (32766, 32767) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert not isnan(row['speak_spanish']) expr = 'missing(hobbies)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and (row[5:9] != [32766, 32766, 32766, 32766] and row[5:9] != [32767, 32767, 32767, 32767]) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['hobbies'] != [{'?': 32766}, {'?': 32766}, {'?': 32766}, {'?': 32766}] \ and row['hobbies'] != [{'?': 32767}, {'?': 32767}, {'?': 32767}, {'?': 32767}] expr = 'not missing(hobbies)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and (row[5:9] == [32766, 32766, 32766, 32766] or row[5:9] == [32767, 32767, 32767, 32767]) ] assert len(df) == 
len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['hobbies'] == [{'?': 32766}, {'?': 32766}, {'?': 32766}, {'?': 32766}] \ or row['hobbies'] == [{'?': 32767}, {'?': 32767}, {'?': 32767}, {'?': 32767}] expr = 'valid(hobbies)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and (32766 in row[5:9] or 32767 in row[5:9]) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert {'?': 32766} in row['hobbies'] or \ {'?': 32767} in row['hobbies'] expr = 'not valid(hobbies)' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and (32766 not in row[5:9] and 32767 not in row[5:9]) ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert {'?': 32766} not in row['hobbies'] and \ {'?': 32767} not in row['hobbies'] # 1.7 Exclusion filter that refers to a subvariable by alias. expr = 'hobbies_1 == 4' dataset.exclude(expr) df = pandaslib.dataframe(dataset) valid_ids = [ row[0] for row in ROWS if row[0] != 'identity' and row[5] != 4 ] assert len(df) == len(valid_ids) for _, row in df.iterrows(): assert row['identity'] in valid_ids assert row['hobbies'][0] != 4 # 1.8 Complex exclusion filters (multiple rules) expr = ( '(religion != 1 and (not valid(speak_spanish) or speak_spanish >= 1)) ' 'or (religion == 1 and speak_spanish == 2) ' 'or (religion == 3 and speak_spanish == 4)' ) dataset.exclude(expr) # 1.9 Clear the exclusion filter. dataset.exclude() df = pandaslib.dataframe(dataset) assert len(df) == len(ROWS) - 1 # excluding the header # 2. Integration Tests for "Transformations". 
categories = [ {'id': 1, 'name': 'Nerds', 'numeric_value': 1, 'missing': False}, {'id': 2, 'name': 'Normal Users', 'numeric_value': 2, 'missing': False}, {'id': 3, 'name': 'Hipsters', 'numeric_value': 3, 'missing': False}, {'id': 32767, 'name': 'Unknown', 'numeric_value': None, 'missing': True} ] rules = [ 'operating_system in ("Linux", "Solaris", "Minix", "FreeBSD", "NetBSD")', 'operating_system == "Windows"', 'operating_system == "MacOS"', 'missing(operating_system)' ] new_var = dataset.create_categorical( categories=categories, rules=rules, name='Operating System Users', alias='operating_system_users', description='Type of Operating System Users' ) assert isinstance(new_var, pycrunch.shoji.Entity) new_var.refresh() assert new_var.body.type == 'categorical' # Check the data on the new variable. df = pandaslib.dataframe(dataset) assert 'operating_system_users' in df # Check the nerds. assert set( r['operating_system'] for _, r in df[df['operating_system_users'] == 'Nerds'].iterrows() ) == {'Linux', 'Solaris', 'Minix', 'FreeBSD', 'NetBSD'} # Check the hipsters. assert set( r['operating_system'] for _, r in df[df['operating_system_users'] == 'Hipsters'].iterrows() ) == {'MacOS'} # Check normal users. assert set( r['operating_system'] for _, r in df[df['operating_system_users'] == 'Normal Users'].iterrows() ) == {'Windows'} # 3. Integration Tests for "Recodes". # 3.1 combine_categories. # On a 'categorical' variable. cat_map = { 1: { 'name': 'Bilingual', 'missing': False, 'combined_ids': [2, 3] }, 2: { 'name': 'Not Bilingual', 'missing': False, 'combined_ids': [1, 4] }, 99: { 'name': 'Unknown', 'missing': True, 'combined_ids': [32766, 32767] } } new_var = dataset.combine_categories( 'speak_spanish', cat_map, 'Bilingual Person', 'bilingual' ) assert isinstance(new_var, pycrunch.shoji.Entity) new_var.refresh() assert new_var.body.type == 'categorical' df = pandaslib.dataframe(dataset) assert 'bilingual' in df # Check the data in the recoded variable. 
bilingual_ids = set( row[0] for row in ROWS if row[0] != 'identity' and row[4] in (2, 3) ) assert set( int(r['identity']) for _, r in df[df['bilingual'] == 'Bilingual'].iterrows() ) == bilingual_ids non_bilingual_ids = set( row[0] for row in ROWS if row[0] != 'identity' and row[4] in (1, 4) ) assert set( int(r['identity']) for _, r in df[df['bilingual'] == 'Not Bilingual'].iterrows() ) == non_bilingual_ids bilingual_null_ids = set( row[0] for row in ROWS if row[0] != 'identity' and row[4] in (32766, 32767) ) assert set( int(r['identity']) for _, r in df[df['bilingual'].isnull()].iterrows() ) == bilingual_null_ids # On a 'categorical_array' variable. cat_map = { 1: { 'name': 'Interested', 'missing': False, 'combined_ids': [1, 2] }, 2: { 'name': 'Not interested', 'missing': False, 'combined_ids': [3, 4] }, 99: { 'name': 'Unknown', 'missing': True, 'combined_ids': [32766, 32767] } } new_var = dataset.combine_categories( 'hobbies', cat_map, 'Hobbies (recoded)', 'hobbies_recoded' ) assert isinstance(new_var, pycrunch.shoji.Entity) new_var.refresh() assert new_var.body.type == 'categorical_array' df = pandaslib.dataframe(dataset) assert 'hobbies_recoded' in df # Check the data in the recoded variable. for _, row in df.iterrows(): hobbies = row['hobbies'] hobbies_rec = row['hobbies_recoded'] assert len(hobbies) == len(hobbies_rec) for i, value in enumerate(hobbies): if value in ({'?': 32766}, {'?': 32767}): assert hobbies_rec[i] == {'?': 99} elif value in (1, 2): assert hobbies_rec[i] == 1 elif value in (3, 4): assert hobbies_rec[i] == 2 # 3.2 combine_responses. 
response_map = { 'music_recoded_1': ['music_1', 'music_2'], 'music_recoded_2': ['music_97'], 'music_recoded_3': ['music_98', 'music_99'] } new_var = dataset.combine_responses( 'music', response_map, 'Music (alt)', 'music_recoded' ) assert isinstance(new_var, pycrunch.shoji.Entity) new_var.refresh() assert new_var.body.type == 'multiple_response' df = pandaslib.dataframe(dataset) assert 'music_recoded' in df # TODO: Test the data in the recoded variable. Unsure of its meaning. finally: dataset.delete()