def test_get_tfidf(fn):
    """Verify that ``fn`` returns the expected TF-IDF values for a tiny corpus.

    ``fn`` is invoked with the keyword arguments ``sentiment_words`` (a
    ``pd.Series``) and ``docs`` (a list of strings). The result is handed to
    ``assert_structure`` together with the reference matrix; afterwards both
    matrices are transposed, reordered with the same hash-based key, and
    compared with ``np.isclose``. Since both sides are reordered the same
    way, the column order of ``fn``'s output should not affect the outcome.

    Raises:
        AssertionError: If any reordered value is not close to the reference.
    """

    def order_rows_by_hash(matrix):
        # Rank every row by the hash of its printed form, then reindex the
        # matrix with that ranking.
        row_keys = [hash(str(row)) for row in matrix]
        ranking = sorted(range(len(row_keys)), key=row_keys.__getitem__)
        return matrix[ranking]

    corpus = [
        "this is a document",
        "this document is the second document",
        "last one",
    ]
    expected_tfidf = np.array([
        [0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.70710678, 0.70710678],
    ])

    actual_tfidf = fn(
        sentiment_words=pd.Series(["one", "last", "second"]),
        docs=corpus,
    )
    assert_structure(actual_tfidf, expected_tfidf, "tfidf")

    values_match = np.isclose(
        order_rows_by_hash(actual_tfidf.T),
        order_rows_by_hash(expected_tfidf.T),
    ).all()
    # The message is only formatted when the assertion fails.
    assert values_match, (
        "Wrong value for tfidf.\n"
        "INPUT docs:\n{}\n\n"
        "OUTPUT tfidf:\n{}\n\n"
        "A POSSIBLE CORRECT OUTPUT FOR tfidf:\n{}\n".format(
            corpus, actual_tfidf, expected_tfidf))
def test_get_bag_of_words(fn):
    """Verify the bag-of-words counts that ``fn`` builds from three documents.

    ``fn`` is called with the single keyword argument ``docs``. Its result is
    checked with ``assert_structure`` against the reference matrix, after
    which both matrices are transposed, reordered with the same hash-based
    key, and compared with ``np.array_equal``. Reordering both sides the
    same way means the column order of ``fn``'s output should not matter.

    Raises:
        AssertionError: If the reordered matrices are not equal.
    """

    def reorder_by_row_hash(matrix):
        # Pair each row position with the hash of the row's printed form and
        # sort the pairs on that hash (the sort is stable).
        hashed = enumerate(hash(str(row)) for row in matrix)
        positions = [pos for pos, _ in sorted(hashed, key=lambda pair: pair[1])]
        return matrix[positions]

    documents = [
        'this is a document',
        'this document is the second document',
        'last one',
    ]
    expected_counts = np.array([
        [0, 1, 1, 0, 0, 0, 1],
        [0, 1, 2, 0, 1, 1, 1],
        [1, 0, 0, 1, 0, 0, 0],
    ])

    observed_counts = fn(docs=documents)
    assert_structure(observed_counts, expected_counts, 'bag_of_words')

    # The message is only formatted when the assertion fails.
    assert np.array_equal(
        reorder_by_row_hash(observed_counts.T),
        reorder_by_row_hash(expected_counts.T),
    ), (
        'Wrong value for bag_of_words.\n'
        'INPUT docs:\n{}\n\n'
        'OUTPUT bag_of_words:\n{}\n\n'
        'A POSSIBLE CORRECT OUTPUT FOR bag_of_words:\n{}\n'.format(
            documents, observed_counts, expected_counts))
def test_get_bag_of_words(fn):
    """Verify the bag-of-words counts ``fn`` builds for given sentiment words.

    ``fn`` is called with the keyword arguments ``sentiment_words`` (a
    ``pd.Series``) and ``docs`` (a list of strings). Its result is checked
    with ``assert_structure`` against the reference matrix; then both
    matrices are transposed, reordered with the same hash-based key, and
    compared with ``np.array_equal``. Reordering both sides the same way
    means the column order of ``fn``'s output should not matter.

    Raises:
        AssertionError: If the reordered matrices are not equal.
    """

    def hash_ordered(matrix):
        # Sorting (hash, position) tuples orders rows by hash and breaks
        # ties by original position, which matches a stable sort on hash.
        keys = [hash(str(row)) for row in matrix]
        order = [pos for _, pos in sorted(zip(keys, range(len(keys))))]
        return matrix[order]

    words = pd.Series(["one", "last", "second"])
    documents = [
        "this is a document",
        "this document is the second document",
        "last one",
    ]
    reference = np.array([
        [0, 0, 0],
        [1, 0, 0],
        [0, 1, 1],
    ])

    result = fn(sentiment_words=words, docs=documents)
    assert_structure(result, reference, "bag_of_words")

    matches = np.array_equal(hash_ordered(result.T), hash_ordered(reference.T))
    # The message is only formatted when the assertion fails.
    assert matches, (
        "Wrong value for bag_of_words.\n"
        "INPUT docs:\n{}\n\n"
        "OUTPUT bag_of_words:\n{}\n\n"
        "A POSSIBLE CORRECT OUTPUT FOR bag_of_words:\n{}\n".format(
            documents, result, reference))
def test_get_tfidf(fn):
    """Verify the TF-IDF matrix ``fn`` computes for three short documents.

    ``fn`` receives the keyword arguments ``sentiment_words`` (a
    ``pd.Series``) and ``docs`` (a list of strings). The returned value is
    validated with ``assert_structure`` and then compared to the reference
    matrix with ``np.isclose`` after both have been transposed and reordered
    with the same hash-based key, so the column order of the output should
    not influence the result.

    Raises:
        AssertionError: If any reordered value is not close to the reference.
    """

    def sort_rows(matrix):
        # Order row positions by the hash of each row's printed form.
        keys = [hash(str(row)) for row in matrix]
        positions = list(range(len(keys)))
        positions.sort(key=lambda pos: keys[pos])
        return matrix[positions]

    sentiment_words = pd.Series(['one', 'last', 'second'])
    docs = [
        'this is a document',
        'this document is the second document',
        'last one',
    ]
    reference = np.array([
        [0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.70710678, 0.70710678],
    ])

    produced = fn(sentiment_words=sentiment_words, docs=docs)
    assert_structure(produced, reference, 'tfidf')

    closeness = np.isclose(sort_rows(produced.T), sort_rows(reference.T))
    # The message is only formatted when the assertion fails.
    assert closeness.all(), (
        'Wrong value for tfidf.\n'
        'INPUT docs:\n{}\n\n'
        'OUTPUT tfidf:\n{}\n\n'
        'A POSSIBLE CORRECT OUTPUT FOR tfidf:\n{}\n'.format(
            docs, produced, reference))
def test_non_overlapping_estimators(fn):
    """Check that ``fn`` returns the given regressors and fits each of them.

    A synthetic data set is built on a (date, asset) ``MultiIndex``: the two
    feature columns hold consecutive integers, and the target is the row sum
    of the features plus noise drawn from a ``RandomState`` seeded with 0.
    ``fn`` is called with the keyword arguments ``x``, ``y``, ``classifiers``
    and ``n_skip_samples``.

    ``RandomForestRegressor.fit`` is replaced by an autospec mock whose
    ``side_effect`` is the real ``fit``, so calls are recorded while the
    original method still runs.

    Raises:
        Exception: If ``fit`` was not recorded as called for a classifier in
            the returned collection.
    """
    n_estimators = 3
    columns = ['test column 1', 'test column 2']
    dates = generate_random_dates(8)
    assets = get_assets(3)
    index = pd.MultiIndex.from_product([dates, assets])
    n_rows = len(index)

    noise = np.random.RandomState(0).random_sample([n_rows]) * n_rows
    # Fill column-by-column: reshape to (n_columns, n_rows), then transpose.
    values = np.arange(n_rows * len(columns)).reshape([len(columns), n_rows]).T
    targets = np.sum(values, axis=-1) + noise

    classifiers = [
        RandomForestRegressor(300, oob_score=True, n_jobs=-1, random_state=101)
        for _ in range(n_estimators)]

    fn_inputs = {
        'x': pd.DataFrame(values, index, columns),
        'y': pd.Series(targets, index),
        'classifiers': classifiers,
        'n_skip_samples': 3}

    random_forest_regressor_fit = RandomForestRegressor.fit
    # Everything below must stay inside the patch: ``classifier.fit`` only
    # exposes ``assert_called`` while the mock is installed on the class.
    with patch.object(RandomForestRegressor, 'fit', autospec=True) as mock_fit:
        mock_fit.side_effect = random_forest_regressor_fit

        fn_return_value = fn(**fn_inputs)

        # Label the structure check after the value being checked so that a
        # failure message refers to the classifiers rather than to PCA.
        assert_structure(
            fn_return_value,
            [RandomForestRegressor for _ in range(n_estimators)],
            'classifiers')

        for classifier in fn_return_value:
            try:
                classifier.fit.assert_called()
            except AssertionError as err:
                raise Exception(
                    'Test Failure: RandomForestRegressor.fit not called on all classifiers'
                ) from err
def test_fit_pca(fn):
    """Check that ``fn`` returns a PCA model fitted on the supplied returns.

    ``fn`` is called with the keyword arguments ``returns`` (a 4x3
    ``pd.DataFrame``), ``num_factor_exposures=2`` and ``svd_solver='full'``.
    ``PCA.fit`` is replaced by an autospec mock whose ``side_effect`` is the
    real ``fit``, so calls are recorded while the original method still runs.

    The test then verifies, in order: the structure of the returned object,
    that ``fit`` was called, that it was called with the returns frame as
    ``X``, and that ``components_`` matches the reference values.

    Raises:
        Exception: If ``fit`` was not called, was called with other
            arguments, or the fitted components do not match.
    """
    sample_dates = generate_random_dates(4)
    sample_assets = get_assets(3)
    returns = pd.DataFrame(
        [
            [0.02769242, 1.34872387, 0.23460972],
            [-0.94728692, 0.68386883, -1.23987235],
            [1.93769376, -0.48275934, 0.34957348],
            [0.23985234, 0.35897345, 0.34598734],
        ],
        sample_dates,
        sample_assets)
    reference_model = PCA()
    expected_components = np.array([
        [0.81925896, -0.40427891, 0.40666118],
        [-0.02011128, 0.68848693, 0.72496985],
    ])

    real_fit = PCA.fit
    # All checks stay inside the patch: ``model.fit`` only exposes the mock
    # assertion helpers while the mock is installed on the class.
    with patch.object(PCA, 'fit', autospec=True) as fit_spy:
        fit_spy.side_effect = real_fit

        model = fn(returns=returns, num_factor_exposures=2, svd_solver='full')

        assert_structure(model, reference_model, 'PCA')

        try:
            model.fit.assert_called()
        except AssertionError:
            raise Exception('Test Failure: PCA.fit not called')

        try:
            model.fit.assert_called_with(self=model, X=returns)
        except Exception:
            # Deliberately broad: any failure while matching the recorded
            # call is reported as a wrong-arguments failure.
            raise Exception(
                'Test Failure: PCA.fit called with the wrong arguments')

        assert_structure(model.components_, expected_components, 'PCA.components_')

        if does_data_match(model.components_, expected_components):
            return

        raise Exception('Test Failure: PCA not fitted correctly\n\n'
                        'PCA.components_:\n'
                        '{}\n\n'
                        'Expected PCA.components_:\n'
                        '{}'.format(model.components_, expected_components))
def test_fit_pca(fn):
    """Exercise ``fn`` and confirm it fits and returns a PCA model.

    The inputs passed to ``fn`` are ``returns`` (a 4x3 ``pd.DataFrame``
    indexed by generated dates with generated asset columns),
    ``num_factor_exposures`` (2) and ``svd_solver`` ('full').

    While ``fn`` runs, ``PCA.fit`` is patched with an autospec mock that
    forwards to the real ``fit`` through ``side_effect``. The returned
    object must pass ``assert_structure``, must have had ``fit`` called with
    the returns frame as ``X``, and its ``components_`` must match the
    reference array according to ``does_data_match``.

    Raises:
        Exception: On any of the failed checks described above.
    """
    row_labels = generate_random_dates(4)
    column_labels = get_assets(3)
    return_rows = [
        [0.02769242, 1.34872387, 0.23460972],
        [-0.94728692, 0.68386883, -1.23987235],
        [1.93769376, -0.48275934, 0.34957348],
        [0.23985234, 0.35897345, 0.34598734]]
    call_kwargs = dict(
        returns=pd.DataFrame(return_rows, row_labels, column_labels),
        num_factor_exposures=2,
        svd_solver='full')

    pca_template = PCA()
    components_reference = np.array([
        [0.81925896, -0.40427891, 0.40666118],
        [-0.02011128, 0.68848693, 0.72496985]])

    unpatched_fit = PCA.fit
    # The mock assertion helpers on ``fitted_pca.fit`` exist only while the
    # patch is active, so every check lives inside the ``with`` block.
    with patch.object(PCA, 'fit', autospec=True) as recorded_fit:
        recorded_fit.side_effect = unpatched_fit
        fitted_pca = fn(**call_kwargs)
        assert_structure(fitted_pca, pca_template, 'PCA')

        try:
            fitted_pca.fit.assert_called()
        except AssertionError:
            raise Exception('Test Failure: PCA.fit not called')

        try:
            fitted_pca.fit.assert_called_with(
                self=fitted_pca, X=call_kwargs['returns'])
        except Exception:
            # Broad on purpose: any error raised while matching the recorded
            # call is surfaced as the same test failure.
            raise Exception('Test Failure: PCA.fit called with the wrong arguments')

        assert_structure(
            fitted_pca.components_, components_reference, 'PCA.components_')

        components_ok = does_data_match(
            fitted_pca.components_, components_reference)
        if not components_ok:
            raise Exception('Test Failure: PCA not fitted correctly\n\n'
                            'PCA.components_:\n'
                            '{}\n\n'
                            'Expected PCA.components_:\n'
                            '{}'.format(fitted_pca.components_,
                                        components_reference))
def test_fit_pca(fn):
    """Validate that ``fn`` builds a PCA model and fits it on the returns.

    ``fn`` is called with ``returns`` (a 4x3 ``pd.DataFrame``),
    ``num_factor_exposures=2`` and ``svd_solver="full"``. For the duration
    of the checks ``PCA.fit`` is swapped for an autospec mock whose
    ``side_effect`` delegates to the real ``fit``.

    Checks performed, in order:
        1. ``assert_structure`` on the returned object against ``PCA()``.
        2. ``fit`` was called at all.
        3. ``fit`` was called with the returns frame as ``X``.
        4. ``assert_structure`` on ``components_`` against the reference.
        5. ``does_data_match`` on ``components_`` against the reference.

    Raises:
        Exception: If check 2, 3 or 5 fails (checks 1 and 4 fail however
            ``assert_structure`` reports failure).
    """
    dates = generate_random_dates(4)
    assets = get_assets(3)
    returns_frame = pd.DataFrame(
        [
            [0.02769242, 1.34872387, 0.23460972],
            [-0.94728692, 0.68386883, -1.23987235],
            [1.93769376, -0.48275934, 0.34957348],
            [0.23985234, 0.35897345, 0.34598734],
        ],
        dates,
        assets,
    )
    expected_pca = PCA()
    expected_components = np.array([
        [0.81925896, -0.40427891, 0.40666118],
        [-0.02011128, 0.68848693, 0.72496985],
    ])

    genuine_fit = PCA.fit
    # Keep every check inside the patch; the mock assertion helpers on
    # ``pca.fit`` are available only while ``PCA.fit`` is mocked.
    with patch.object(PCA, "fit", autospec=True) as fit_mock:
        fit_mock.side_effect = genuine_fit

        pca = fn(
            returns=returns_frame,
            num_factor_exposures=2,
            svd_solver="full",
        )
        assert_structure(pca, expected_pca, "PCA")

        try:
            pca.fit.assert_called()
        except AssertionError:
            raise Exception("Test Failure: PCA.fit not called")

        try:
            pca.fit.assert_called_with(self=pca, X=returns_frame)
        except Exception:
            # Intentionally broad so that any error raised while matching
            # the recorded call is reported uniformly.
            raise Exception(
                "Test Failure: PCA.fit called with the wrong arguments")

        actual_components = pca.components_
        assert_structure(actual_components, expected_components, "PCA.components_")

        if does_data_match(pca.components_, expected_components):
            return

        raise Exception("Test Failure: PCA not fitted correctly\n\n"
                        "PCA.components_:\n"
                        "{}\n\n"
                        "Expected PCA.components_:\n"
                        "{}".format(pca.components_, expected_components))