def test_build(self):
    """RandomDataSplitter with a 20% evaluation share splits 10 training rows into 8/2.

    The builder reads the training data from the mocked AI, splits it, and must
    write both resulting sets back exactly once via the setters.
    """
    builder = RandomDataSplitter(evaluation_data_perc=20, data_source=RandomDataSplitter.TRAINING_DATA)
    # Fix: the label must be passed as `name=`. Passed positionally it becomes the
    # Mock *spec*, which restricts attribute access on the mock to `str`'s attributes.
    arti = mock.Mock(name='AIBuilder.AbstractAI')

    # mock training model (10 rows — the set that actually gets split)
    data = {'col1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0],
            'col2': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]}
    dataframe = pd.DataFrame(data=data)
    training_model = DataModel(dataframe)

    arti.get_training_data = mock.Mock()
    arti.get_training_data.return_value = training_model
    arti.set_training_data = mock.Mock()

    # mock evaluation model (present but not the split source here)
    data = {'col1': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
            'col2': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]}
    dataframe = pd.DataFrame(data=data)
    evaluation_model = DataModel(dataframe)

    arti.get_evaluation_data = mock.Mock()
    arti.get_evaluation_data.return_value = evaluation_model
    arti.set_evaluation_data = mock.Mock()

    builder.build(ml_model=arti)

    arti.set_evaluation_data.assert_called_once()
    arti.set_training_data.assert_called_once()

    split_evaluation_data = arti.set_evaluation_data.call_args[0][0].get_dataframe()
    split_training_data = arti.set_training_data.call_args[0][0].get_dataframe()

    # 20% of ten rows -> 2 evaluation rows, 8 training rows.
    self.assertEqual(2, len(split_evaluation_data))
    self.assertEqual(8, len(split_training_data))
class TestDataSetSplitter(unittest.TestCase):
    """Exercise DataSetSplitter.split_by_ratio on a ten-row frame."""

    def setUp(self):
        # Ten rows: one target column plus two numeric features, each 0..9.
        self._data = {name: list(range(10)) for name in ('target', 'feature_1', 'feature_2')}
        self._dataframe = pd.DataFrame(data=self._data)
        self._data_model = DataModel(self._dataframe)
        self._data_model.set_tf_feature_columns([
            tf.feature_column.numeric_column('feature_1'),
            tf.feature_column.numeric_column('feature_2'),
        ])
        self._data_model.set_target_column('target')

    def test_split_data(self):
        # A [20, 80] ratio over ten rows should produce 2 evaluation and 8 training rows.
        splitter = DataSetSplitter(self._data_model)
        evaluation_data, train_data = splitter.split_by_ratio(ratios=[20, 80])

        self.assertEqual(len(train_data.get_target_column()), 8)
        self.assertEqual(len(train_data.get_feature_columns()), 8)
        self.assertEqual(len(evaluation_data.get_target_column()), 2)
        self.assertEqual(len(evaluation_data.get_feature_columns()), 2)
class TestDataset(unittest.TestCase):
    """Basic DataModel accessors: column validation, feature and target selection."""

    def setUp(self):
        self._data = {'col1': [1, 2], 'col2': [3, 4], 'col4': [5, 6]}
        self._dataframe = pd.DataFrame(data=self._data)
        self._dataset = DataModel(self._dataframe)

    def test_validate_columns_invalid(self):
        # 'col3' is not present, so validation must raise.
        with self.assertRaises(RuntimeError):
            self._dataset.validate_columns(['col3'])

    def test_validate_columns(self):
        # A known column must pass silently.
        self._dataset.validate_columns(['col1'])

    def test_feature_columns(self):
        wanted = ['col1', 'col2']
        self._dataset.set_feature_columns(wanted)
        selected = self._dataset.get_feature_columns()
        self.assertEqual(list(selected.columns.values), wanted)

    def test_target_column(self):
        self._dataset.set_target_column('col1')
        target = self._dataset.get_target_column()
        self.assertEqual(target.tolist(), self._data['col1'])
def render_tf_feature_columns(self, data_model: DataModel):
    """Build tf feature columns for every configured feature and attach them to data_model.

    The model's column list is reset first, then each configured feature is
    rendered through its strategy and the results are appended one by one.
    """
    data_model.set_tf_feature_columns([])
    for info in self.feature_columns():
        strategy = FeatureColumnStrategyFactory.get_strategy(
            info['name'], info['type'], data_model, self.feature_config())
        # A single strategy may yield several tf columns (e.g. bucketized + source).
        for built_column in strategy.build():
            data_model.add_tf_feature_columns(built_column)
def setUp(self):
    # Fixture: one 'date' column holding ISO-8601 timestamp strings (Zulu suffix).
    timestamps = [
        '2017-03-26T05:04:46.539Z',
        '2017-12-01T23:04:46.539Z',
        '2017-02-08T07:38:48.129Z',
    ]
    self._data = {'date': timestamps}
    self._dataframe = pd.DataFrame(data=self._data)
    self.data_model = DataModel(self._dataframe)
def setUp(self):
    # Two tokenised sentences for the stemmer to work on.
    sentences = [
        ['is', 'this', 'a', 'stemmable', 'sentence'],
        ['cats', 'are', 'smarter', 'than', 'dogs'],
    ]
    self.df = pd.DataFrame({'test': sentences})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    # Tokenised rows mixing stopwords with content words.
    rows = [
        ['this', 'sentence', 'has', 'multiple', 'stopwords'],
        ['this', 'sentence', 'one', 'multiple', 'too'],
        ['verb', 'noun'],
        ['too', 'than', 'can'],
    ]
    self.df = pd.DataFrame({'test': rows})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    # Three small numeric columns; all are registered as numerical metadata.
    self._data = {
        'column_1': [0, 2, 3],
        'column_2': [3, 2, 1],
        'column_3': [3, 2, 2],
    }
    self._dataframe = pd.DataFrame(data=self._data)
    self._data_model = DataModel(self._dataframe)
    self._data_model.metadata.define_numerical_columns(list(self._data.keys()))
def setUp(self):
    # Sentences carrying assorted punctuation marks.
    texts = [
        'This is, some text.',
        'Is this, (some) text!?',
        'Would you like: ham, spam and eggs; spam, ham and eggs or eggs, ham and spam?',
    ]
    self.df = pd.DataFrame({'text': texts})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def assign_fn(self, data_model: DataModel, fn_name: str, kwargs: dict):
    """Resolve fn_name to an input function, preferring the fn_holder registry.

    Falls through to tf's pandas input fn for the PANDAS_FN name, wiring the
    model's x/y data into kwargs; any other unknown name yields None.
    """
    if hasattr(self.fn_holder, fn_name):
        return self.load_from_holder(data_model, fn_name, kwargs)

    if fn_name == self.PANDAS_FN:
        # Feed the data model straight into the tf pandas input fn.
        kwargs['x'] = data_model.get_input_fn_x_data()
        kwargs['y'] = data_model.get_target_column()
        kwargs['target_column'] = data_model.target_column_name
        return tf.estimator.inputs.pandas_input_fn(**kwargs)
class StopWordScrubberTest(unittest.TestCase):
    """StopWordScrubber tests: stopwords are removed in place or into a new column."""

    def setUp(self):
        rows = [
            ['this', 'sentence', 'has', 'multiple', 'stopwords'],
            ['this', 'sentence', 'one', 'multiple', 'too'],
            ['verb', 'noun'],
            ['too', 'than', 'can'],
        ]
        self.df = pd.DataFrame({'test': rows})
        self.data_model = DataModel(self.df)
        self.data_model.metadata = MetaData()

    def _assert_scrubbed(self, column):
        # None of the known stopwords may survive in any row.
        for tokens in column:
            for stopword in ('this', 'has', 'too', 'than', 'can'):
                self.assertFalse(stopword in tokens)

    def test_scrubbing_new_column(self):
        scrubber = StopWordScrubber(column='test', new_column='test2')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        self._assert_scrubbed(df['test2'])
        # The source column keeps its original row lengths untouched.
        for index, expected_length in enumerate((5, 5, 2, 3)):
            self.assertEqual(expected_length, len(df['test'][index]))

    def test_scrubbing(self):
        scrubber = StopWordScrubber(column='test')
        scrubber.scrub(self.data_model)
        self._assert_scrubbed(self.data_model.get_dataframe()['test'])

    def test_verbose_scrubbing(self):
        scrubber = StopWordScrubber(column='test', verbosity=1)
        scrubber.scrub(self.data_model)
        self._assert_scrubbed(self.data_model.get_dataframe()['test'])
def setUp(self):
    # Every column except unknown_1 holds exactly one missing value.
    self._data = dict(
        numerical_1=[2, None, 3, 4],
        numerical_3=[None, 2, 3, 4],
        categorical_1=['one', None, 'two', 'three'],
        categorical_2=['apple', 'pie', None, 'three'],
        categorical_3=['apple', 'pie', None, 'three'],
        unknown_1=[9, 10, 11, 12],
    )
    self._dataframe = pd.DataFrame(data=self._data)
    self._data_model = DataModel(self._dataframe)
def setUp(self):
    # Short free-text rows for the text scrubber under test.
    texts = [
        'This is some text.',
        'Get some text ASAP?',
        'This is some text for John and Joan',
    ]
    self.df = pd.DataFrame({'text': texts})
    self.data_model = DataModel(self.df)
    self.data_model.metadata = MetaData()
def setUp(self):
    # Values with a currency and a date column spanning 2017, 2018 and "now".
    jan_2018 = datetime.strptime('01-01-2018', '%d-%m-%Y')
    jan_2017 = datetime.strptime('01-01-2017', '%d-%m-%Y')
    newest = datetime.now()
    self._data = {
        'value': [0, 2, 100],
        'currency': ['EUR', 'USD', 'EUR'],
        'date': [newest, jan_2018, jan_2017],
    }
    self._dataframe = pd.DataFrame(data=self._data)
    self._data_model = DataModel(self._dataframe)
class testHTMLScrubber(unittest.TestCase):
    """Tests for CodeScrubber: HTML markup and embedded script code are stripped
    from a text column while plain prose is preserved."""

    def setUp(self):
        # Fixture rows: simple HTML, HTML with attributes and inline CSS, nested
        # tags, untagged prose (must survive scrubbing), and embedded jQuery code.
        data = {
            'text': [
                '<p>this is some text</p>',
                '<h1 id="bla", class="blabla", style="transform: translatyeY(-50%)">this is some more text</h1>',
                '<p>and even <b>more</b> text, damn</p>',
                'this is my text (dont remove this)',
                "this is some text, $('span#TrackingJobBody a').each(function (i, v) { if ($(v).attr('href')) { var href = $(v).attr('href').toLowerCase(); if (href.match(\"^http\")) { switch (true) { case /facebook/.test(href): $(v).attr('mns_rt', 'NonJob-Facebook'); break; case /linkedin/.test(href): $(v).attr('mns_rt', 'NonJob-Linkedin'); break; case /twitter\.com/.test(href): $(v).attr('mns_rt', 'NonJob-Twitter'); break; case /plus\.google\.com/.test(href): $(v).attr('mns_rt', 'NonJob-GooglePlus'); break; case /youtube/.test(href): $(v).attr('data-track', 'Client-Social-Youtube'); break; case /http[s]?\:\/\/([a-z0-9\-\.]{1,}\.[a-z]{2,})[\/]?$/.test(href): $(v).attr('data-track', 'Client-Link-Homepage'); break; default: $(v).attr('mns_rt', 'jobcustomapplyonline'); break; } } } });"
            ]
        }
        metadata = MetaData()
        self.df = pd.DataFrame(data)
        self.data_model = DataModel(self.df)
        self.data_model.metadata = metadata

    def testHTMLRemoval(self):
        # In-place scrub of the 'text' column.
        scrubber = CodeScrubber('text')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        for text in df['text']:
            self.validate_string(text)

    def validate_string(self, text):
        # Shared assertions: markup characters and attribute fragments must be
        # gone, plain prose must remain.
        print(text)
        self.assertFalse('>' in text)
        self.assertFalse('<' in text)
        self.assertFalse('{' in text)
        self.assertFalse('bla' in text)
        self.assertFalse('transform' in text)
        self.assertFalse('50%' in text)
        self.assertTrue('dont remove this' in text)

    def testHTMLRemoval_verbose(self):
        # Same scrub with verbosity enabled; result must be unchanged.
        scrubber = CodeScrubber('text', verbosity=1)
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        for text in df['text']:
            self.validate_string(text)

    def testHTMLRemoval_new_col(self):
        # Scrub into a separate 'new' column instead of overwriting 'text'.
        scrubber = CodeScrubber(text_column='text', new_text_column='new')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        for text in df['new']:
            self.validate_string(text)
def setUp(self):
    # Fixture rows for code/HTML scrubbing: simple HTML, HTML with attributes and
    # inline CSS, nested tags, untagged prose, and a row of embedded jQuery code.
    data = {
        'text': [
            '<p>this is some text</p>',
            '<h1 id="bla", class="blabla", style="transform: translatyeY(-50%)">this is some more text</h1>',
            '<p>and even <b>more</b> text, damn</p>',
            'this is my text (dont remove this)',
            "this is some text, $('span#TrackingJobBody a').each(function (i, v) { if ($(v).attr('href')) { var href = $(v).attr('href').toLowerCase(); if (href.match(\"^http\")) { switch (true) { case /facebook/.test(href): $(v).attr('mns_rt', 'NonJob-Facebook'); break; case /linkedin/.test(href): $(v).attr('mns_rt', 'NonJob-Linkedin'); break; case /twitter\.com/.test(href): $(v).attr('mns_rt', 'NonJob-Twitter'); break; case /plus\.google\.com/.test(href): $(v).attr('mns_rt', 'NonJob-GooglePlus'); break; case /youtube/.test(href): $(v).attr('data-track', 'Client-Social-Youtube'); break; case /http[s]?\:\/\/([a-z0-9\-\.]{1,}\.[a-z]{2,})[\/]?$/.test(href): $(v).attr('data-track', 'Client-Link-Homepage'); break; default: $(v).attr('mns_rt', 'jobcustomapplyonline'); break; } } } });"
        ]
    }
    metadata = MetaData()
    self.df = pd.DataFrame(data)
    self.data_model = DataModel(self.df)
    self.data_model.metadata = metadata
def setUp(self):
    # Ten rows: a target column and two numeric features, each running 0..9.
    self._data = {name: list(range(10)) for name in ('target', 'feature_1', 'feature_2')}
    self._dataframe = pd.DataFrame(data=self._data)
    self._data_model = DataModel(self._dataframe)
    self._data_model.set_tf_feature_columns([
        tf.feature_column.numeric_column('feature_1'),
        tf.feature_column.numeric_column('feature_2'),
    ])
    self._data_model.set_target_column('target')
class testPunctuationScrubber(unittest.TestCase):
    """PunctuationScrubber tests: plain, verbose, and new-column modes."""

    def setUp(self):
        texts = [
            'This is, some text.',
            'Is this, (some) text!?',
            'Would you like: ham, spam and eggs; spam, ham and eggs or eggs, ham and spam?',
        ]
        self.df = pd.DataFrame({'text': texts})
        self.data_model = DataModel(self.df)
        self.data_model.metadata = MetaData()

    def test_punc_removal(self):
        scrubber = PunctuationScrubber('text')
        scrubber.scrub(self.data_model)
        for text in self.data_model.get_dataframe()['text']:
            self.validate_string(text)

    def validate_string(self, text):
        # No punctuation character may survive scrubbing.
        for mark in ',.;:!?()':
            self.assertFalse(mark in text)

    def test_punc_removal_verbose(self):
        scrubber = PunctuationScrubber('text', verbosity=1)
        scrubber.scrub(self.data_model)
        for text in self.data_model.get_dataframe()['text']:
            self.validate_string(text)

    def test_punc_removal_new_col(self):
        scrubber = PunctuationScrubber(text_column='text', new_text_column='new')
        scrubber.scrub(self.data_model)
        for text in self.data_model.get_dataframe()['new']:
            self.validate_string(text)
def setUp(self):
    # One numeric column plus a multi-hot column of comma-joined category codes.
    self.df = pd.DataFrame({
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'mh_1': ['EUR,USD', 'USD,JPY,AUD', 'EUR', 'EUR,GBP,AUD', 'USD',
                 'EUR,JPY', 'EUR,GBP', 'USD,JPY', 'EUR,GBP', 'USD'],
    })
    self.data_model = DataModel(self.df)
    metadata = MetaData()
    metadata.define_numerical_columns(['num_1'])
    metadata.define_multiple_cat_columns(['mh_1'])
    self.data_model.metadata = metadata
def setUp(self):
    # Two numeric columns plus one categorical currency column.
    self.df = pd.DataFrame({
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'num_2': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'cat_1': ['EUR', 'USD', 'EUR', 'EUR', 'USD',
                  'EUR', 'EUR', 'USD', 'EUR', 'USD'],
    })
    self.data_model = DataModel(self.df)
    metadata = MetaData()
    metadata.define_numerical_columns(['num_1', 'num_2'])
    metadata.define_categorical_columns(['cat_1'])
    self.data_model.metadata = metadata
def base_fn(data_model: Data.DataModel, batch_size=1, epoch=1):
    """Input function one, made for shoes AI.

    Builds a shuffled, repeated and batched tf Dataset from the model's
    features and target, and returns one-shot iterator tensors.

    :param data_model: Data.MLDataset
    :param epoch: int
    :param batch_size: int
    :return: next-element tensors from a one-shot iterator
    """
    feature_dict = _dataset_to_dict(features=data_model.get_feature_columns())
    target = data_model.get_target_column()
    pipeline = (tf.data.Dataset
                .from_tensor_slices((feature_dict, target))
                .shuffle(100)
                .repeat(epoch)
                .batch(batch_size))
    return pipeline.make_one_shot_iterator().get_next()
def balance_data(self, data_model: DataModel, target_column_name: str) -> DataModel:
    """Attach a per-row weights column compensating for label imbalance.

    Emits a warning because downstream builders must be told about the
    extra column explicitly.
    """
    label_weights = self.map_weights_by_cat_label(data_model, target_column_name)
    df = data_model.get_dataframe()
    row_weights = self.get_weights_list(df, target_column_name, label_weights)

    warnings.warn(
        'note: adding weights column ({}), make sure it is passed to the estimator- and '
        'data builder!'.format(WEIGHTS_COLUMN))

    df[WEIGHTS_COLUMN] = row_weights
    data_model.set_dataframe(df)

    return data_model
def test_no_categories(self):
    # Splitting on a numerical column (no categories) must trip the assertion.
    builder = CategoricalDataSplitter(data_source='training', column_name='col2')
    self.arti.set_training_data(DataModel(self.dataframe))
    with self.assertRaises(AssertionError):
        self.arti = builder.build(ml_model=self.arti)
def setUp(self) -> None:
    """Build a FeatureColumnBuilder and a mocked AI carrying one training DataModel.

    col1: categorical, col3: numerical, col4: multi-valued indicator,
    col5: bucketized (2 buckets); col2 doubles as the mocked target column.
    """
    self.builder = FeatureColumnBuilder(
        feature_columns={
            'col1': FeatureColumnStrategy.CATEGORICAL_COLUMN_VOC_LIST,
            'col3': FeatureColumnStrategy.NUMERICAL_COLUMN,
            'col4': FeatureColumnStrategy.INDICATOR_COLUMN_VOC_LIST,
            'col5': FeatureColumnStrategy.BUCKETIZED_COLUMN,
        },
        feature_config={'col5': {'buckets': 2}}
    )
    # Fix: the label must be passed as `name=`. Passed positionally it becomes the
    # Mock *spec*, which restricts attribute access on the mock to `str`'s attributes.
    self.arti = mock.Mock(name='AIBuilder.AbstractAI')

    # mock training model
    data = {'col3': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0],
            'col2': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0],
            'col1': ['cat_one', 'cat_one', 'cat_one', 'cat_one', 'cat_one',
                     'cat_two', 'cat_one', 'cat_two', 'cat_one', 'cat_two'],
            'col4': [
                ['cat_four', 'cat_three', 'cat_four'],
                ['cat_two', 'cat_three', 'cat_four'],
                ['cat_two', 'cat_three', 'cat_four'],
                ['cat_two', 'cat_four'],
                ['cat_one', 'cat_three', 'cat_four'],
                ['cat_one', 'cat_three', 'cat_four'],
                ['cat_one', 'cat_three'],
                ['cat_one', 'cat_three', 'cat_four'],
                ['cat_one', 'cat_two', 'cat_three'],
                ['cat_two', 'cat_three'],
            ],
            'col5': [1, 2, 3, 4, 1, 2, 3, 4, 3, 4]}

    dataframe = pd.DataFrame(data=data)
    self.training_model = DataModel(dataframe)
    self.training_model.get_target_column = mock.Mock()
    self.training_model.get_target_column.return_value = 'col2'

    self.arti.get_training_data = mock.Mock()
    self.arti.get_training_data.return_value = self.training_model
    self.arti.set_training_data = mock.Mock()

    # No evaluation or prediction data in this fixture.
    self.arti.get_evaluation_data = mock.Mock()
    self.arti.get_evaluation_data.return_value = None
    self.arti.get_prediction_data = mock.Mock()
    self.arti.get_prediction_data.return_value = None
def testScrubbing(self):
    """Scrub a list-of-categories column into per-category binary (multi-hot) columns."""
    data = {
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'list_1': [['EUR', 'USD'], ['USD', 'JPY', 'AUD'], ['EUR'], ['EUR', 'GBP', 'AUD'], ['USD'],
                   ['EUR', 'JPY'], ['EUR', 'GBP'], ['USD', 'JPY'], ['EUR', 'GBP'], ['USD']],
    }
    metadata = MetaData()
    metadata.define_numerical_columns(['num_1'])
    metadata.define_multiple_cat_columns(['list_1'])
    self.df = pd.DataFrame(data)
    self.data_model = DataModel(self.df)
    self.data_model.metadata = metadata

    scrubber = MultipleCatListToMultipleHotScrubber(col_name='list_1')
    scrubber.validate(self.data_model)
    scrubber.scrub(self.data_model)

    new_df = self.data_model.get_dataframe()
    columns = list(new_df.columns.values)

    # test new columns
    self.assertEqual(len(columns), 7)
    self.assertIn('list_1_EUR', columns)
    self.assertIn('list_1_GBP', columns)
    self.assertIn('list_1_USD', columns)
    self.assertIn('list_1_JPY', columns)
    self.assertIn('list_1_AUD', columns)

    # check column contents
    has_EUR_series = new_df['list_1_EUR']
    self.assertEqual(list(has_EUR_series.to_dict().values()), [1, 0, 1, 1, 0, 1, 1, 0, 1, 0])

    # test metadata
    # Fix: these assertions previously re-checked `columns` instead of the
    # metadata registry, so binary_columns' contents were never verified.
    meta_data_categorical_cols = self.data_model.metadata.binary_columns
    self.assertEqual(len(meta_data_categorical_cols), 5)
    self.assertIn('list_1_EUR', meta_data_categorical_cols)
    self.assertIn('list_1_GBP', meta_data_categorical_cols)
    self.assertIn('list_1_USD', meta_data_categorical_cols)
    self.assertIn('list_1_JPY', meta_data_categorical_cols)
    self.assertIn('list_1_AUD', meta_data_categorical_cols)
class WordStemmerTest(unittest.TestCase):
    """WordStemmer tests: tokens are stemmed in place or into a new column."""

    def setUp(self):
        rows = [
            ['is', 'this', 'a', 'stemmable', 'sentence'],
            ['cats', 'are', 'smarter', 'than', 'dogs'],
        ]
        self.df = pd.DataFrame({'test': rows})
        self.data_model = DataModel(self.df)
        self.data_model.metadata = MetaData()

    def _assert_stemmed(self, column):
        # Expected stems for both fixture rows.
        self.assertEqual(column[0], ['is', 'this', 'a', 'stemmabl', 'sentenc'])
        self.assertEqual(column[1], ['cat', 'are', 'smarter', 'than', 'dog'])

    def test_scrubbing(self):
        scrubber = WordStemmer(column='test')
        scrubber.scrub(self.data_model)
        self._assert_stemmed(self.data_model.get_dataframe()['test'])

    def test_scrubbing_verbose(self):
        scrubber = WordStemmer(column='test', verbosity=1)
        scrubber.scrub(self.data_model)
        self._assert_stemmed(self.data_model.get_dataframe()['test'])

    def test_scrubbing_new_col(self):
        scrubber = WordStemmer(column='test', new_column='new')
        scrubber.scrub(self.data_model)
        df = self.data_model.get_dataframe()
        self._assert_stemmed(df['new'])
        # The source column must remain unmodified.
        self.assertEqual(df['test'][0], ['is', 'this', 'a', 'stemmable', 'sentence'])
        self.assertEqual(df['test'][1], ['cats', 'are', 'smarter', 'than', 'dogs'])
def setUp(self):
    # Two numeric value columns, a categorical currency column, and dates
    # spanning 2017, 2018 and the current moment.
    jan_2018 = datetime.strptime('01-01-2018', '%d-%m-%Y')
    jan_2017 = datetime.strptime('01-01-2017', '%d-%m-%Y')
    newest = datetime.now()
    self._data = {
        'value_1': [0, 1, 50],
        'value_2': [0, 3, 150],
        'currency': ['EUR', 'USD', 'EUR'],
        'date': [newest, jan_2018, jan_2017],
    }
    self._df = pd.DataFrame(self._data)
    self.data_model = DataModel(self._df)
    self.data_model.metadata.define_numerical_columns(['value_1', 'value_2'])
    self.data_model.metadata.define_categorical_columns(['currency'])
def separate_by_target_categories(data_model: DataModel, target_column_name):
    """Split the model's dataframe into two frames, one per target category.

    NOTE(review): assumes the target column holds at least two distinct
    categories; any category beyond the first two is silently dropped — confirm
    callers only pass binary targets.
    """
    df = data_model.get_dataframe()
    target = df[target_column_name]
    labels = target.unique()
    return df.loc[target == labels[0]], df.loc[target == labels[1]]
def balance_data(self, data_model: DataModel, target_column_name: str) -> DataModel:
    """Oversample the minority label stack until both stacks are equally long."""
    majority, minority = self.prepare_data(
        data_model=data_model, target_column_name=target_column_name)

    target_length = len(majority)
    original_minority = minority.copy()
    # Append whole copies of the minority stack until it is at least long
    # enough, then trim the overshoot back to the majority length.
    while len(minority) < target_length:
        minority = pd.concat([minority, original_minority])
    minority = self.cut_df_to_length(minority, target_length)

    self.validate_result(majority, minority)
    balanced_df = self.merge_stacks(majority, minority)
    data_model.set_dataframe(balanced_df)

    return data_model
def setUp(self):
    # One spread-out numeric column and one constant numeric column.
    self.df = pd.DataFrame({
        'num_1': [0, 1, 2, 3, 4, 5, 6, 7, 8, 50],
        'num_2': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    })
    self.data_model = DataModel(self.df)
    metadata = MetaData()
    metadata.define_numerical_columns(['num_1', 'num_2'])
    self.data_model.metadata = metadata