示例#1
0
def test_get_tfidf(fn):
    """Check that ``fn`` reproduces a reference TF-IDF matrix.

    ``fn`` is called with the keyword arguments ``sentiment_words`` (a pandas
    Series of three words) and ``docs`` (a list of three strings).  The result
    is passed to ``assert_structure`` and is then compared with the reference
    values column by column, so the order of the columns does not matter.

    Raises:
        AssertionError: if the values are not close to the reference matrix.
    """

    def sort_ndarray(array):
        # Order the entries by the hash of their printed form, so entries
        # that print identically end up at the same position in both the
        # actual and the expected matrix.
        keys = [hash(str(row)) for row in array]
        order = sorted(range(len(keys)), key=keys.__getitem__)
        return array[order]

    sentiment_words = pd.Series(["one", "last", "second"])
    docs = [
        "this is a document",
        "this document is the second document",
        "last one",
    ]
    expected_tfidf = np.array([
        [0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.70710678, 0.70710678],
    ])

    fn_out = fn(sentiment_words=sentiment_words, docs=docs)
    assert_structure(fn_out, expected_tfidf, "tfidf")

    # Transpose first: sort_ndarray reorders along the first axis, and it is
    # the columns whose order is allowed to differ.
    actual_columns = sort_ndarray(fn_out.T)
    expected_columns = sort_ndarray(expected_tfidf.T)
    assert np.isclose(actual_columns, expected_columns).all(), (
        "Wrong value for tfidf.\n"
        "INPUT docs:\n{}\n\n"
        "OUTPUT tfidf:\n{}\n\n"
        "A POSSIBLE CORRECT OUTPUT FOR tfidf:\n{}\n".format(
            docs, fn_out, expected_tfidf))
def test_get_bag_of_words(fn):
    """Check that ``fn`` reproduces a reference bag-of-words count matrix.

    ``fn`` is called with the single keyword argument ``docs`` (a list of
    three strings).  The result is passed to ``assert_structure`` and is then
    compared exactly with the reference counts, column by column, so the
    order of the columns does not matter.

    Raises:
        AssertionError: if the counts differ from the reference matrix.
    """

    def sort_ndarray(array):
        # Order the entries by the hash of their printed form, so entries
        # that print identically end up at the same position in both the
        # actual and the expected matrix.
        keys = [hash(str(row)) for row in array]
        order = sorted(range(len(keys)), key=keys.__getitem__)
        return array[order]

    docs = [
        'this is a document',
        'this document is the second document',
        'last one',
    ]
    expected_bag = np.array([
        [0, 1, 1, 0, 0, 0, 1],
        [0, 1, 2, 0, 1, 1, 1],
        [1, 0, 0, 1, 0, 0, 0],
    ])

    fn_out = fn(docs=docs)
    assert_structure(fn_out, expected_bag, 'bag_of_words')

    # Transpose first: sort_ndarray reorders along the first axis, and it is
    # the columns whose order is allowed to differ.
    actual_columns = sort_ndarray(fn_out.T)
    expected_columns = sort_ndarray(expected_bag.T)
    assert np.array_equal(actual_columns, expected_columns), (
        'Wrong value for bag_of_words.\n'
        'INPUT docs:\n{}\n\n'
        'OUTPUT bag_of_words:\n{}\n\n'
        'A POSSIBLE CORRECT OUTPUT FOR bag_of_words:\n{}\n'.format(
            docs, fn_out, expected_bag))
示例#3
0
def test_get_bag_of_words(fn):
    """Check ``fn`` against reference bag-of-words counts for given words.

    ``fn`` is called with the keyword arguments ``sentiment_words`` (a pandas
    Series of three words) and ``docs`` (a list of three strings).  The
    result is passed to ``assert_structure`` and is then compared exactly
    with the reference counts, column by column, so the order of the columns
    does not matter.

    Raises:
        AssertionError: if the counts differ from the reference matrix.
    """

    def sort_ndarray(array):
        # Order the entries by the hash of their printed form, so entries
        # that print identically end up at the same position in both the
        # actual and the expected matrix.
        keys = [hash(str(row)) for row in array]
        order = sorted(range(len(keys)), key=keys.__getitem__)
        return array[order]

    sentiment_words = pd.Series(["one", "last", "second"])
    docs = [
        "this is a document",
        "this document is the second document",
        "last one",
    ]
    expected_bag = np.array([
        [0, 0, 0],
        [1, 0, 0],
        [0, 1, 1],
    ])

    fn_out = fn(sentiment_words=sentiment_words, docs=docs)
    assert_structure(fn_out, expected_bag, "bag_of_words")

    # Transpose first: sort_ndarray reorders along the first axis, and it is
    # the columns whose order is allowed to differ.
    actual_columns = sort_ndarray(fn_out.T)
    expected_columns = sort_ndarray(expected_bag.T)
    assert np.array_equal(actual_columns, expected_columns), (
        "Wrong value for bag_of_words.\n"
        "INPUT docs:\n{}\n\n"
        "OUTPUT bag_of_words:\n{}\n\n"
        "A POSSIBLE CORRECT OUTPUT FOR bag_of_words:\n{}\n".format(
            docs, fn_out, expected_bag))
def test_get_tfidf(fn):
    """Check ``fn`` against reference TF-IDF values for three short documents.

    ``fn`` is called with the keyword arguments ``sentiment_words`` (a pandas
    Series of three words) and ``docs`` (a list of three strings).  The
    result is passed to ``assert_structure`` and is then compared, within
    ``np.isclose`` tolerance, with the reference values column by column, so
    the order of the columns does not matter.

    Raises:
        AssertionError: if the values are not close to the reference matrix.
    """

    def sort_ndarray(array):
        # Order the entries by the hash of their printed form, so entries
        # that print identically end up at the same position in both the
        # actual and the expected matrix.
        keys = [hash(str(row)) for row in array]
        order = sorted(range(len(keys)), key=keys.__getitem__)
        return array[order]

    sentiment_words = pd.Series(['one', 'last', 'second'])
    docs = [
        'this is a document',
        'this document is the second document',
        'last one',
    ]
    expected_tfidf = np.array([
        [0.0, 0.0, 0.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.70710678, 0.70710678],
    ])

    fn_out = fn(sentiment_words=sentiment_words, docs=docs)
    assert_structure(fn_out, expected_tfidf, 'tfidf')

    # Transpose first: sort_ndarray reorders along the first axis, and it is
    # the columns whose order is allowed to differ.
    actual_columns = sort_ndarray(fn_out.T)
    expected_columns = sort_ndarray(expected_tfidf.T)
    assert np.isclose(actual_columns, expected_columns).all(), (
        'Wrong value for tfidf.\n'
        'INPUT docs:\n{}\n\n'
        'OUTPUT tfidf:\n{}\n\n'
        'A POSSIBLE CORRECT OUTPUT FOR tfidf:\n{}\n'.format(
            docs, fn_out, expected_tfidf))
def test_non_overlapping_estimators(fn):
    """Check that ``fn`` returns fitted random-forest regressors.

    A two-column feature frame and a noisy target series are built on a
    (date, asset) MultiIndex of 8 dates and 3 assets.  ``fn`` is called with
    the keyword arguments ``x``, ``y``, ``classifiers`` (three
    ``RandomForestRegressor`` instances) and ``n_skip_samples=3`` while
    ``RandomForestRegressor.fit`` is wrapped by a recording mock that still
    runs the real ``fit``.

    Raises:
        Exception: if ``fit`` is not reported as called for a returned
            classifier.
    """
    n_estimators = 3
    columns = ['test column 1', 'test column 2']
    dates = generate_random_dates(8)
    assets = get_assets(3)
    index = pd.MultiIndex.from_product([dates, assets])
    row_count = len(index)

    # Fixed seed keeps the noise, and therefore the targets, reproducible.
    noise = np.random.RandomState(0).random_sample([row_count]) * row_count
    # One row per index entry, one column per feature name.
    values = np.arange(row_count * len(columns)).reshape(
        [len(columns), row_count]).T
    targets = values.sum(axis=-1) + noise

    classifiers = []
    for _ in range(n_estimators):
        classifiers.append(
            RandomForestRegressor(
                300, oob_score=True, n_jobs=-1, random_state=101))

    features = pd.DataFrame(values, index=index, columns=columns)
    labels = pd.Series(targets, index=index)

    real_fit = RandomForestRegressor.fit
    with patch.object(RandomForestRegressor, 'fit', autospec=True) as mock_fit:
        # Delegate to the real implementation so the regressors still train.
        mock_fit.side_effect = real_fit
        fn_return_value = fn(
            x=features,
            y=labels,
            classifiers=classifiers,
            n_skip_samples=3)

        # NOTE(review): the label 'PCA' looks copied from the PCA test --
        # confirm whether assert_structure should report a different name.
        assert_structure(
            fn_return_value, [RandomForestRegressor] * n_estimators, 'PCA')

        # NOTE(review): fit is patched on the class, so this presumably checks
        # the shared mock rather than a per-instance call record -- confirm.
        for fitted in fn_return_value:
            try:
                fitted.fit.assert_called()
            except AssertionError:
                raise Exception('Test Failure: RandomForestRegressor.fit not called on all classifiers')
def test_fit_pca(fn):
    """Check that ``fn`` returns a PCA model fitted on the given returns.

    ``fn`` is called with the keyword arguments ``returns`` (a 4x3 DataFrame
    indexed by dates with one column per asset), ``num_factor_exposures=2``
    and ``svd_solver='full'``.  ``PCA.fit`` is wrapped by a recording mock
    that still runs the real ``fit``, so the test can verify both the call
    and the resulting ``components_``.

    Raises:
        Exception: if ``fit`` was not called, was called with other
            arguments, or the fitted components do not match the reference.
    """
    dates = generate_random_dates(4)
    assets = get_assets(3)

    returns = pd.DataFrame(
        [
            [0.02769242, 1.34872387, 0.23460972],
            [-0.94728692, 0.68386883, -1.23987235],
            [1.93769376, -0.48275934, 0.34957348],
            [0.23985234, 0.35897345, 0.34598734],
        ],
        index=dates,
        columns=assets)
    # Reference object and values; the PCA instance is created before the
    # class is patched.
    expected_pca = PCA()
    expected_components = np.array([
        [0.81925896, -0.40427891, 0.40666118],
        [-0.02011128, 0.68848693, 0.72496985],
    ])

    real_fit = PCA.fit
    with patch.object(PCA, 'fit', autospec=True) as mock_fit:
        # Delegate to the real implementation so the model is actually fitted.
        mock_fit.side_effect = real_fit

        fn_return_value = fn(
            returns=returns,
            num_factor_exposures=2,
            svd_solver='full')

        assert_structure(fn_return_value, expected_pca, 'PCA')

        try:
            fn_return_value.fit.assert_called()
        except AssertionError:
            raise Exception('Test Failure: PCA.fit not called')

        # Any error raised while comparing the recorded call counts as a
        # mismatch, hence the broad except.
        try:
            fn_return_value.fit.assert_called_with(
                self=fn_return_value, X=returns)
        except Exception:
            raise Exception(
                'Test Failure: PCA.fit called with the wrong arguments')

        fitted_components = fn_return_value.components_
        assert_structure(
            fitted_components, expected_components, 'PCA.components_')

        if not does_data_match(fitted_components, expected_components):
            raise Exception('Test Failure: PCA not fitted correctly\n\n'
                            'PCA.components_:\n'
                            '{}\n\n'
                            'Expected PCA.components_:\n'
                            '{}'.format(fitted_components,
                                        expected_components))
def test_fit_pca(fn):
    """Verify that ``fn`` fits a PCA model and returns it.

    The function under test receives ``returns`` (a 4x3 DataFrame indexed by
    dates, one column per asset), ``num_factor_exposures=2`` and
    ``svd_solver='full'`` as keyword arguments.  While it runs, ``PCA.fit``
    is replaced by an autospec mock whose side effect is the real ``fit``,
    which lets the test inspect the call and still obtain ``components_``.

    Raises:
        Exception: if ``fit`` was not called, was called with other
            arguments, or the fitted components do not match the reference.
    """
    dates = generate_random_dates(4)
    assets = get_assets(3)

    returns = pd.DataFrame(
        [
            [0.02769242, 1.34872387, 0.23460972],
            [-0.94728692, 0.68386883, -1.23987235],
            [1.93769376, -0.48275934, 0.34957348],
            [0.23985234, 0.35897345, 0.34598734],
        ],
        index=dates,
        columns=assets)
    # Built before patching, like the captured reference to the real fit.
    expected_pca = PCA()
    expected_components = np.array([
        [0.81925896, -0.40427891, 0.40666118],
        [-0.02011128, 0.68848693, 0.72496985],
    ])

    real_fit = PCA.fit
    with patch.object(PCA, 'fit', autospec=True) as mock_fit:
        mock_fit.side_effect = real_fit

        fn_return_value = fn(
            returns=returns,
            num_factor_exposures=2,
            svd_solver='full')

        assert_structure(fn_return_value, expected_pca, 'PCA')

        try:
            fn_return_value.fit.assert_called()
        except AssertionError:
            raise Exception('Test Failure: PCA.fit not called')

        # Broad except on purpose: any failure while checking the recorded
        # call is reported as wrong arguments.
        try:
            fn_return_value.fit.assert_called_with(
                self=fn_return_value, X=returns)
        except Exception:
            raise Exception('Test Failure: PCA.fit called with the wrong arguments')

        fitted_components = fn_return_value.components_
        assert_structure(
            fitted_components, expected_components, 'PCA.components_')

        if not does_data_match(fitted_components, expected_components):
            raise Exception('Test Failure: PCA not fitted correctly\n\n'
                            'PCA.components_:\n'
                            '{}\n\n'
                            'Expected PCA.components_:\n'
                            '{}'.format(fitted_components, expected_components))
示例#8
0
def test_fit_pca(fn):
    """Exercise ``fn`` and confirm it hands back a correctly fitted PCA.

    ``fn`` is invoked with the keyword arguments ``returns`` (a 4x3 DataFrame
    indexed by dates, one column per asset), ``num_factor_exposures=2`` and
    ``svd_solver="full"``.  ``PCA.fit`` is patched with an autospec mock that
    forwards to the real ``fit``; the test then checks that ``fit`` was
    called, that it received the same ``returns`` frame, and that
    ``components_`` matches the reference values.

    Raises:
        Exception: if ``fit`` was not called, was called with other
            arguments, or the fitted components do not match the reference.
    """
    dates = generate_random_dates(4)
    assets = get_assets(3)

    returns = pd.DataFrame(
        [
            [0.02769242, 1.34872387, 0.23460972],
            [-0.94728692, 0.68386883, -1.23987235],
            [1.93769376, -0.48275934, 0.34957348],
            [0.23985234, 0.35897345, 0.34598734],
        ],
        index=dates,
        columns=assets,
    )
    # Created while PCA is still unpatched.
    expected_pca = PCA()
    expected_components = np.array([
        [0.81925896, -0.40427891, 0.40666118],
        [-0.02011128, 0.68848693, 0.72496985],
    ])

    real_fit = PCA.fit
    with patch.object(PCA, "fit", autospec=True) as mock_fit:
        # Keep the real fitting behaviour while recording the call.
        mock_fit.side_effect = real_fit

        fn_return_value = fn(
            returns=returns,
            num_factor_exposures=2,
            svd_solver="full",
        )

        assert_structure(fn_return_value, expected_pca, "PCA")

        try:
            fn_return_value.fit.assert_called()
        except AssertionError:
            raise Exception("Test Failure: PCA.fit not called")

        # Every error from the call comparison is treated as a mismatch.
        try:
            fn_return_value.fit.assert_called_with(
                self=fn_return_value, X=returns)
        except Exception:
            raise Exception(
                "Test Failure: PCA.fit called with the wrong arguments")

        fitted_components = fn_return_value.components_
        assert_structure(
            fitted_components, expected_components, "PCA.components_")

        if not does_data_match(fitted_components, expected_components):
            raise Exception("Test Failure: PCA not fitted correctly\n\n"
                            "PCA.components_:\n"
                            "{}\n\n"
                            "Expected PCA.components_:\n"
                            "{}".format(fitted_components,
                                        expected_components))