def test_document_scores_reciprocal(self):
    document_df = load_data_frame(self.document_scores_file,
                                  class_labels=False,
                                  match_distance=True)
    scores = reciprocal_distance(document_df)
    expected = pd.Series([1., .5, 1 / 3])
    pd.testing.assert_series_equal(scores, expected, check_names=False)
def test_paragraph_scores_reciprocal(self):
    paragraph_df = load_data_frame(self.paragraph_scores_file,
                                   class_labels=False,
                                   match_distance=True)
    scores = reciprocal_distance(paragraph_df)
    expected = pd.Series([1., .1, .01, .001])
    pd.testing.assert_series_equal(scores, expected, check_names=False)
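# Taken together, the two tests above pin down the scoring relation: a match at
# distance d is scored 1 / d (distances 1, 2, 3 map to 1, .5, 1/3; distances
# 1, 10, 100, 1000 map to 1, .1, .01, .001). A minimal sketch of a function
# satisfying them follows; the column name 'distance' is an assumption, since
# the frame returned by load_data_frame(..., match_distance=True) may name the
# distance column differently.
def _reciprocal_distance_sketch(df):
    # score each match as the reciprocal of its (assumed) 'distance' column
    return 1.0 / df['distance']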
def test_fasttext_cv_independent_associations(self):
    dim = 20
    bucket = 1000
    cv_folds = 2

    test_df = dt.load_data_frame(self.cv_test_path)
    test_df['text'] = test_df['text'].apply(lambda s: s.strip().lower())

    cv_results = fth.fasttext_cv_independent_associations(
        test_df, {
            '-bucket': bucket,
            '-dim': dim
        },
        self.ft_path,
        cv_folds=cv_folds,
        random_state=np.random.RandomState(3))

    expected_col_names = [
        'mean_test_score',
        'stdev_test_score',
        'mean_train_score',
        'stdev_train_score',
        'split_0_test_score',
        'split_0_train_score',
        'split_0_n_test',
        'split_0_pos_test',
        'split_0_n_train',
        'split_0_pos_train',
        'split_1_test_score',
        'split_1_train_score',
        'split_1_n_test',
        'split_1_pos_test',
        'split_1_n_train',
        'split_1_pos_train',
    ]

    cv_runs = 1
    expected_values = [
        [1.0] * cv_runs,
        [0.0] * cv_runs,
        [1.0] * cv_runs,
        [0.0] * cv_runs,
        [1.0] * cv_runs,
        [1.0] * cv_runs,
        [20] * cv_runs,
        [0.5] * cv_runs,
        [20] * cv_runs,
        [0.5] * cv_runs,
        [1.0] * cv_runs,
        [1.0] * cv_runs,
        [20] * cv_runs,
        [0.5] * cv_runs,
        [20] * cv_runs,
        [0.5] * cv_runs,
    ]

    expected_df = pd.DataFrame(
        {
            col: values
            for col, values in zip(expected_col_names, expected_values)
        },
        columns=expected_col_names)

    assert_frame_equal(cv_results, expected_df)
def test_reproducibility_associations(self):
    test_case_df = data_tools.load_data_frame(self.test_case_df_path)

    run1 = cv.cv_independent_associations(
        test_case_df, cv_folds=3, random_state=np.random.RandomState(0))
    run2 = cv.cv_independent_associations(
        test_case_df, cv_folds=3, random_state=np.random.RandomState(0))

    for first, second in zip(run1, run2):
        train_first, test_first = first
        train_second, test_second = second

        np.testing.assert_array_equal(train_first, train_second)
        np.testing.assert_array_equal(test_first, test_second)

        assert len(train_first) == 4
        assert len(test_first) == 2
        assert len(train_second) == 4
        assert len(test_second) == 2
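# test_reproducibility_associations relies on the fact that two RandomState
# instances created from the same seed produce identical shuffles, so the two
# runs yield the same folds. A minimal, self-contained sketch of that principle
# (illustrative only, not the project's cv module):
def _seeded_shuffle_sketch(n, seed):
    # shuffle 0..n-1 with a freshly seeded RandomState; same seed -> same order
    indices = np.arange(n)
    np.random.RandomState(seed).shuffle(indices)
    return indices
# _seeded_shuffle_sketch(6, 0) returns the same permutation on every call.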
def test_cos_random_cv_bad_param(self):
    cv_folds = 2
    cv_iterations = 2

    def cv_function(data_df, params, random_state):
        return cos.cv_independent_associations(data_df,
                                               params,
                                               cv_folds=cv_folds,
                                               random_state=random_state,
                                               fasttext_epochs=5,
                                               fasttext_bucket=1000,
                                               fasttext_dim=20)

    test_df = data_tools.load_data_frame(self.cos_cv_test_path,
                                         match_distance=True)
    test_df['text'] = test_df['text'].apply(lambda s: s.strip().lower())

    with raises(TypeError, match="got an unexpected keyword argument"):
        _ = cv.random_cv(test_df, cv_function, cv_iterations,
                         {'sentence_weightXXXX': 1},
                         cos.get_hyperparameter_distributions(), 3)
def test_randomness_associations(self):
    test_case_df = data_tools.load_data_frame(self.test_case_df_path)

    # Since this check can fail by chance, allow a couple of attempts.
    max_attempts = 20
    attempt = 0
    while True:
        if attempt == max_attempts:
            fail('Failed since no randomness was found in the shuffled splits.')
        else:
            try:
                run1 = cv.cv_independent_associations(test_case_df, cv_folds=3)
                run2 = cv.cv_independent_associations(test_case_df, cv_folds=3)

                for first, second in zip(run1, run2):
                    train_first, test_first = first
                    train_second, test_second = second

                    assert not all([x in train_first for x in train_second])
                    assert not all([x in test_first for x in test_second])
                break
            except AssertionError:
                attempt += 1
def test_distance_scorer_exception(self):
    with raises(ValueError):
        _distance_scorer(
            load_data_frame(self.paragraph_scores_file,
                            class_labels=False,
                            match_distance=False), None)
def test_cos_random_cv(self):
    paragraph_weight = 3
    cv_folds = 2
    cv_iterations = 2

    def cv_function(data_df, params, random_state):
        return cos.cv_independent_associations(data_df,
                                               params,
                                               cv_folds=cv_folds,
                                               random_state=random_state,
                                               fasttext_epochs=5,
                                               fasttext_bucket=1000,
                                               fasttext_dim=20)

    test_df = data_tools.load_data_frame(self.cos_cv_test_path,
                                         match_distance=True)
    test_df['text'] = test_df['text'].apply(lambda s: s.strip().lower())

    cv_results = cv.random_cv(test_df, cv_function, cv_iterations,
                              {'paragraph_weight': paragraph_weight},
                              cos.get_hyperparameter_distributions(), 3)

    expected_col_names = [
        'decay_rate',
        'distance_ceiling',
        'distance_offset',
        'document_weight',
        'paragraph_weight',
        'score_cutoff',
        'weighting_exponent',
        'mean_test_score',
        'stdev_test_score',
        'mean_train_score',
        'stdev_train_score',
        'split_0_test_score',
        'split_0_train_score',
        'split_0_n_test',
        'split_0_pos_test',
        'split_0_n_train',
        'split_0_pos_train',
        'split_1_test_score',
        'split_1_train_score',
        'split_1_n_test',
        'split_1_pos_test',
        'split_1_n_train',
        'split_1_pos_train',
    ]
    assert expected_col_names == list(cv_results.columns)

    # The following parameters are chosen randomly, so their values cannot be
    # tested; only check that they differ between the two CV runs.
    random_col_names = [
        'decay_rate',
        'distance_ceiling',
        'distance_offset',
        'document_weight',
        'score_cutoff',
        'weighting_exponent',
    ]
    for rand in random_col_names:
        assert cv_results.loc[0, rand] != cv_results.loc[1, rand]

    # Ignore columns tied to test performance, since it cannot be predicted for
    # random parameter choices.
    ignore_params = [
        'mean_test_score',
        'stdev_test_score',
        'mean_train_score',
        'stdev_train_score',
        'split_0_test_score',
        'split_0_train_score',
        'split_1_test_score',
        'split_1_train_score',
    ]

    expected_values = [
        [.444] * cv_iterations,
        [.333] * cv_iterations,
        [.222] * cv_iterations,
        [.111] * cv_iterations,
        [paragraph_weight] * cv_iterations,
        [.111] * cv_iterations,
        [.111] * cv_iterations,
        [1.0] * cv_iterations,
        [0.0] * cv_iterations,
        [1.0] * cv_iterations,
        [0.0] * cv_iterations,
        [1.0] * cv_iterations,
        [1.0] * cv_iterations,
        [24] * cv_iterations,
        [0.5] * cv_iterations,
        [24] * cv_iterations,
        [0.5] * cv_iterations,
        [1.0] * cv_iterations,
        [1.0] * cv_iterations,
        [24] * cv_iterations,
        [0.5] * cv_iterations,
        [24] * cv_iterations,
        [0.5] * cv_iterations,
    ]
    expected_df = pandas.DataFrame(
        {
            col: values
            for col, values in zip(expected_col_names, expected_values)
        },
        columns=expected_col_names)

    results_test_df = cv_results.drop(random_col_names + ignore_params, axis=1)
    expected_test_df = expected_df.drop(random_col_names + ignore_params,
                                        axis=1)
    assert_frame_equal(results_test_df, expected_test_df)
def test_fth_random_cv(self):
    bucket = 1000
    dim = 20
    cv_folds = 2
    cv_iterations = 2

    def cv_function(data_df, params, random_state):
        return fth.fasttext_cv_independent_associations(
            data_df,
            params,
            self.ft_path,
            cv_folds=cv_folds,
            random_state=random_state)

    test_df = data_tools.load_data_frame(self.ft_cv_test_path)
    test_df['text'] = test_df['text'].apply(lambda s: s.strip().lower())

    cv_results = cv.random_cv(test_df, cv_function, cv_iterations, {
        '-bucket': bucket,
        '-dim': dim
    }, fth.get_hyperparameter_distributions(), 3)

    expected_col_names = [
        'bucket',
        'dim',
        'epoch',
        'lr',
        'wordNgrams',
        'ws',
        'mean_test_score',
        'stdev_test_score',
        'mean_train_score',
        'stdev_train_score',
        'split_0_test_score',
        'split_0_train_score',
        'split_0_n_test',
        'split_0_pos_test',
        'split_0_n_train',
        'split_0_pos_train',
        'split_1_test_score',
        'split_1_train_score',
        'split_1_n_test',
        'split_1_pos_test',
        'split_1_n_train',
        'split_1_pos_train',
    ]
    assert expected_col_names == list(cv_results.columns)

    # The following parameters are chosen randomly, so their values cannot be
    # tested; only check that they differ between the two CV runs.
    random_col_names = [
        'epoch',
        'lr',
        'wordNgrams',
        'ws',
    ]
    for rand in random_col_names:
        assert cv_results.loc[0, rand] != cv_results.loc[1, rand]

    # Ignore columns tied to test performance, since it cannot be predicted for
    # random parameter choices.
    ignore_params = [
        'mean_test_score',
        'stdev_test_score',
        'mean_train_score',
        'stdev_train_score',
        'split_0_test_score',
        'split_0_train_score',
        'split_1_test_score',
        'split_1_train_score',
    ]

    expected_values = [
        [1000] * cv_iterations,
        [20] * cv_iterations,
        [.1] * cv_iterations,
        [.2] * cv_iterations,
        [.3] * cv_iterations,
        [.4] * cv_iterations,
        [1.0] * cv_iterations,
        [0.0] * cv_iterations,
        [1.0] * cv_iterations,
        [0.0] * cv_iterations,
        [1.0] * cv_iterations,
        [1.0] * cv_iterations,
        [20] * cv_iterations,
        [0.5] * cv_iterations,
        [20] * cv_iterations,
        [0.5] * cv_iterations,
        [1.0] * cv_iterations,
        [1.0] * cv_iterations,
        [20] * cv_iterations,
        [0.5] * cv_iterations,
        [20] * cv_iterations,
        [0.5] * cv_iterations,
    ]
    expected_df = pandas.DataFrame(
        {
            col: values
            for col, values in zip(expected_col_names, expected_values)
        },
        columns=expected_col_names)

    results_test_df = cv_results.drop(random_col_names + ignore_params, axis=1)
    expected_test_df = expected_df.drop(random_col_names + ignore_params,
                                        axis=1)
    assert_frame_equal(results_test_df, expected_test_df)
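# Both random-CV tests above exercise the same contract for cv.random_cv: fixed
# parameters (e.g. 'paragraph_weight' or '-bucket'/'-dim') are held constant
# across iterations, the remaining hyperparameters are redrawn from the supplied
# distributions on every iteration, and one result row per iteration is collected
# into a single DataFrame. A rough sketch of that contract, assuming the
# distributions expose a scipy-style rvs() and that cv_function returns a mapping
# of score columns to values (hypothetical names, not the actual cv.random_cv
# implementation):
def _random_cv_sketch(data_df, cv_function, iterations, fixed_params,
                      param_distributions, seed):
    random_state = np.random.RandomState(seed)
    rows = []
    for _ in range(iterations):
        # draw one value per tunable hyperparameter, then merge in the fixed ones
        params = {name: dist.rvs(random_state=random_state)
                  for name, dist in param_distributions.items()}
        params.update(fixed_params)
        scores = cv_function(data_df, params, random_state)
        rows.append({**params, **scores})
    return pandas.DataFrame(rows)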