def setUp(self):
    """Create a DataModel over two numerical columns.

    ``num_1`` holds 0..8 followed by 50 and ``num_2`` holds ten ones; both
    columns are registered as numerical in the attached MetaData.
    """
    self.df = pd.DataFrame({
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'num_2': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    })
    self.data_model = DataModel(self.df)

    column_types = MetaData()
    column_types.define_numerical_columns(['num_1', 'num_2'])
    self.data_model.metadata = column_types
def setUp(self):
    """Create a DataModel with a single categorical column ``cat_1``."""
    self.df = pd.DataFrame({
        'cat_1': ['one', 'two', 'three', 'two', 'one'],
    })
    self.data_model = DataModel(self.df)

    column_types = MetaData()
    column_types.define_categorical_columns(['cat_1'])
    self.data_model.metadata = column_types
def setUp(self):
    """Build a two-row, seven-column DataFrame and a MetaData created from it.

    No column types are defined here; each test assigns them as needed.
    """
    self._data = {
        'numerical_1': [1, 2],
        'numerical_2': [3, 4],
        'numerical_3': [3, 4],
        'categorical_1': [7, 8],
        'categorical_2': [5, 6],
        'categorical_3': [5, 6],
        'unknown_1': [9, 10],
    }
    frame = pd.DataFrame(data=self._data)
    self._dataframe = frame
    self.meta_data = MetaData(frame)
def setUp(self):
    """Create a DataModel with a categorical animal column and an integer column.

    Only ``categorical`` is registered in the metadata; ``column`` is left
    without an explicit type.
    """
    animals = [
        'cat', 'dog', 'cat', 'cat', 'dog', 'bird',
        'cat', 'cat', 'dog', 'bird', 'cat', 'dog',
    ]
    numbers = [12, 45, 23, 78, 4, 34, 1, 3, 89, 0, 1, 56]

    self.df = pd.DataFrame({'categorical': animals, 'column': numbers})
    self.data_model = DataModel(self.df)

    column_types = MetaData()
    column_types.define_categorical_columns(['categorical'])
    self.data_model.metadata = column_types
def setUp(self):
    """Create a DataModel with one text column ``text_1`` of eight short phrases."""
    phrases = [
        'i am one',
        'i am I',
        'we are two',
        'the three of us',
        'two become one',
        'we are legion',
        '2 become 1',
        'we are foo bar',
    ]
    self.df = pd.DataFrame({'text_1': phrases})
    self.data_model = DataModel(self.df)

    column_types = MetaData()
    column_types.define_text_columns(['text_1'])
    self.data_model.metadata = column_types
def setUp(self):
    """Create a DataModel whose ``test`` column holds two lists of word strings.

    The attached MetaData is empty: no column types are defined.
    """
    word_lists = [
        ['is', 'this', 'a', 'stemmable', 'sentence'],
        ['cats', 'are', 'smarter', 'than', 'dogs'],
    ]
    self.df = pd.DataFrame({'test': word_lists})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    """Create a DataModel whose ``test`` column holds four lists of word strings.

    The attached MetaData is empty: no column types are defined.
    """
    word_lists = [
        ['this', 'sentence', 'has', 'multiple', 'stopwords'],
        ['this', 'sentence', 'one', 'multiple', 'too'],
        ['verb', 'noun'],
        ['too', 'than', 'can'],
    ]
    self.df = pd.DataFrame({'test': word_lists})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    """Create a DataModel whose ``text`` column holds three punctuated sentences.

    The attached MetaData is empty: no column types are defined.
    """
    sentences = [
        'This is, some text.',
        'Is this, (some) text!?',
        'Would you like: ham, spam and eggs; spam, ham and eggs or eggs, ham and spam?',
    ]
    self.df = pd.DataFrame({'text': sentences})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    """Create a DataModel whose ``text`` column holds three mixed-case sentences.

    The attached MetaData is empty: no column types are defined.
    """
    sentences = [
        'This is some text.',
        'Get some text ASAP?',
        'This is some text for John and Joan',
    ]
    self.df = pd.DataFrame({'text': sentences})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    """Create an AIFactory and an AI assembled from three mock builders.

    The builders form a dependency chain: A depends on B, B depends on C,
    and C depends on nothing. They are passed to ``create_AI`` in A, B, C
    order and the result is stored on ``self.artie``.
    """

    def make_builder(builder_type, dependent_on):
        # Mock a Builder whose describe() returns {builder_type: 1}.
        builder = mock.Mock(spec=Builder)
        builder.dependent_on = dependent_on
        builder.builder_type = builder_type
        builder.describe = mock.Mock()
        builder.describe.return_value = {builder_type: 1}
        builder.build = mock.Mock()
        return builder

    self.factory = AIFactory.AIFactory(
        builders=[], project_name='test', log_dir='test/dir')

    self.builder_A = make_builder('A', ['B'])
    self.builder_B = make_builder('B', ['C'])
    self.builder_C = make_builder('C', [])

    self.artie = self.factory.create_AI(
        [self.builder_A, self.builder_B, self.builder_C])
def setUp(self):
    """Create a DataModel whose ``text`` column mixes HTML markup, plain text and inline JavaScript.

    The attached MetaData is empty: no column types are defined.
    """
    # The literals below are kept on single lines so their content stays exact.
    samples = [
        '<p>this is some text</p>',
        '<h1 id="bla", class="blabla", style="transform: translatyeY(-50%)">this is some more text</h1>',
        '<p>and even <b>more</b> text, damn</p>',
        'this is my text (dont remove this)',
        "this is some text, $('span#TrackingJobBody a').each(function (i, v) { if ($(v).attr('href')) { var href = $(v).attr('href').toLowerCase(); if (href.match(\"^http\")) { switch (true) { case /facebook/.test(href): $(v).attr('mns_rt', 'NonJob-Facebook'); break; case /linkedin/.test(href): $(v).attr('mns_rt', 'NonJob-Linkedin'); break; case /twitter\.com/.test(href): $(v).attr('mns_rt', 'NonJob-Twitter'); break; case /plus\.google\.com/.test(href): $(v).attr('mns_rt', 'NonJob-GooglePlus'); break; case /youtube/.test(href): $(v).attr('data-track', 'Client-Social-Youtube'); break; case /http[s]?\:\/\/([a-z0-9\-\.]{1,}\.[a-z]{2,})[\/]?$/.test(href): $(v).attr('data-track', 'Client-Link-Homepage'); break; default: $(v).attr('mns_rt', 'jobcustomapplyonline'); break; } } } });",
    ]
    self.df = pd.DataFrame({'text': samples})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    """Create a DataModel with a list column of word lists plus an integer column.

    ``target_column`` is registered as a list column in the metadata, and
    ``self.blacklist`` holds the three words stored for use by the tests.
    """
    word_lists = [
        ['Job', 'in', 'Dixon', 'with', 'successful', 'business'],
        ['Engineer', 'Quality', 'in', 'Dixon'],
        ['Shift', 'Supervisor', 'Part', 'time', 'job', 'in', 'Camphill'],
        ['Construction', 'PM', 'Job', 'in', 'Dixon'],
        ['CyberCoders', 'Application', 'Principal', 'QA', 'Engineer', 'Java'],
    ]
    self.df = pd.DataFrame({
        'target_column': word_lists,
        'column': [12, 45, 23, 78, 4],
    })
    self.data_model = DataModel(self.df)

    column_types = MetaData()
    column_types.define_list_columns(['target_column'])
    self.data_model.metadata = column_types

    self.blacklist = ['job', 'in', 'Dixon']
def setUp(self):
    """Create a DataModel with a numerical column and a multiple-category column.

    ``mh_1`` stores its categories as comma-separated strings such as
    ``'EUR,USD'`` and is registered via ``define_multiple_cat_columns``.
    """
    joined_codes = [
        'EUR,USD', 'USD,JPY,AUD', 'EUR', 'EUR,GBP,AUD', 'USD',
        'EUR,JPY', 'EUR,GBP', 'USD,JPY', 'EUR,GBP', 'USD',
    ]
    self.df = pd.DataFrame({
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'mh_1': joined_codes,
    })
    self.data_model = DataModel(self.df)

    column_types = MetaData()
    column_types.define_numerical_columns(['num_1'])
    column_types.define_multiple_cat_columns(['mh_1'])
    self.data_model.metadata = column_types
def setUp(self):
    """Create a DataModel with a numerical column and a two-valued categorical column."""
    codes = [
        'EUR', 'USD', 'EUR', 'EUR', 'USD',
        'EUR', 'EUR', 'USD', 'EUR', 'USD',
    ]
    self.df = pd.DataFrame({
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'cat_1': codes,
    })
    self.data_model = DataModel(self.df)

    column_types = MetaData()
    column_types.define_numerical_columns(['num_1'])
    column_types.define_categorical_columns(['cat_1'])
    self.data_model.metadata = column_types
def testScrubbing(self):
    """Check that scrubbing ``list_1`` yields one binary column per category.

    After ``MultipleCatListToMultipleHotScrubber`` has validated and scrubbed
    the model, the dataframe must have seven columns including the five
    ``list_1_<code>`` columns, the EUR column must flag exactly the rows whose
    list contained ``'EUR'``, and the metadata must list those same five
    columns as binary columns.
    """
    data = {
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'list_1': [
            ['EUR', 'USD'],
            ['USD', 'JPY', 'AUD'],
            ['EUR'],
            ['EUR', 'GBP', 'AUD'],
            ['USD'],
            ['EUR', 'JPY'],
            ['EUR', 'GBP'],
            ['USD', 'JPY'],
            ['EUR', 'GBP'],
            ['USD'],
        ],
    }
    metadata = MetaData()
    metadata.define_numerical_columns(['num_1'])
    metadata.define_multiple_cat_columns(['list_1'])
    self.df = pd.DataFrame(data)
    self.data_model = DataModel(self.df)
    self.data_model.metadata = metadata

    scrubber = MultipleCatListToMultipleHotScrubber(col_name='list_1')
    scrubber.validate(self.data_model)
    scrubber.scrub(self.data_model)

    new_df = self.data_model.get_dataframe()
    columns = list(new_df.columns.values)
    expected_hot_columns = [
        'list_1_EUR',
        'list_1_GBP',
        'list_1_USD',
        'list_1_JPY',
        'list_1_AUD',
    ]

    # test new columns
    self.assertEqual(len(columns), 7)
    for hot_column in expected_hot_columns:
        self.assertIn(hot_column, columns)

    # check column contents
    has_EUR_series = new_df['list_1_EUR']
    self.assertEqual(list(has_EUR_series.to_dict().values()),
                     [1, 0, 1, 1, 0, 1, 1, 0, 1, 0])

    # test metadata: membership is checked against the metadata itself,
    # not against the dataframe columns already verified above.
    binary_columns = self.data_model.metadata.binary_columns
    self.assertEqual(len(binary_columns), 5)
    for hot_column in expected_hot_columns:
        self.assertIn(hot_column, binary_columns)
class TestMetaData(unittest.TestCase):
    """Tests for MetaData column-type bookkeeping and its string rendering."""

    def setUp(self):
        """Build a two-row, seven-column DataFrame and a MetaData created from it."""
        self._data = {
            'numerical_1': [1, 2],
            'numerical_2': [3, 4],
            'numerical_3': [3, 4],
            'categorical_1': [7, 8],
            'categorical_2': [5, 6],
            'categorical_3': [5, 6],
            'unknown_1': [9, 10],
        }
        frame = pd.DataFrame(data=self._data)
        self._dataframe = frame
        self.meta_data = MetaData(frame)

    def _define_expected_types(self):
        """Assign the categorical_* and numerical_* columns their matching types."""
        md = self.meta_data
        md.define_categorical_columns(['categorical_1', 'categorical_2'])
        md.define_numerical_columns(['numerical_1', 'numerical_2'])
        md.define_categorical_columns(['categorical_3'])
        md.define_numerical_columns(['numerical_3'])

    def test_categorize_columns(self):
        """Redefining column types must leave only the most recent assignment."""
        # categorize all columns
        self._define_expected_types()

        # scramble the columns
        self.meta_data.define_categorical_columns(
            ['numerical_1', 'numerical_2'])
        self.meta_data.define_numerical_columns(
            ['categorical_1', 'categorical_2', 'unknown_1'])

        # categorize the columns again
        self._define_expected_types()
        self.meta_data.define_unknown_columns(['unknown_1'])

        expected_categorical = [
            'categorical_1', 'categorical_2', 'categorical_3']
        expected_numerical = ['numerical_1', 'numerical_2', 'numerical_3']
        expected_unknown = ['unknown_1']

        self.assertListEqual(expected_categorical,
                             self.meta_data.categorical_columns)
        self.assertListEqual(expected_numerical,
                             self.meta_data.numerical_columns)
        self.assertListEqual(expected_unknown,
                             self.meta_data.uncategorized_columns)

    def test_unknown_is_default(self):
        """A column with no defined type must report the type 'unknown'."""
        column_type = self.meta_data.get_column_type('categorical_2')
        self.assertEqual(column_type, 'unknown')

    def test_to_string(self):
        """str(MetaData) must list every column with its type, one per line."""
        self.meta_data.define_categorical_columns(
            ['categorical_1', 'categorical_2', 'categorical_3'])
        self.meta_data.define_numerical_columns(
            ['numerical_1', 'numerical_2', 'numerical_3'])

        categorical = MetaData.CATEGORICAL_DATA_TYPE
        numerical = MetaData.NUMERICAL_DATA_TYPE
        unknown = MetaData.UNKNOWN_DATA_TYPE

        expected_string = (
            '\nmetadata:\ncategorical_1: ' + categorical
            + '\ncategorical_2: ' + categorical
            + '\ncategorical_3: ' + categorical
            + '\nnumerical_1: ' + numerical
            + '\nnumerical_2: ' + numerical
            + '\nnumerical_3: ' + numerical
            + '\nunknown_1: ' + unknown
        )

        stringified_metadata = str(self.meta_data)
        self.assertEqual(
            expected_string, stringified_metadata,
            'Metadata __str__ function does not meet expectations.')