def test_document_distribution_empty():
    """
    Test distribution of the empty string over all printable characters.
    :return:
    """
    # Only verifies the call completes without error; the result is discarded.
    _ = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution, "", characters=string.printable)
def test_document_distribution_1_custom_nn():
    """
    Test distribution over a custom character set without normalization.
    :return:
    """
    # Raw (un-normalized) counts over the characters '1', '2', '3' only.
    observed = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution,
        DOCUMENT_EXAMPLE_1,
        characters=['1', '2', '3'],
        norm=False)
    assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_CUSTOM_NO_NORM, observed)
def test_document_line_distribution_empty(self):
    """
    Test that empty text yields an empty distribution over printable characters.
    """
    # An empty document should produce an empty dictionary.
    observed = lexnlp_tests.benchmark_extraction_func(
        func=build_document_distribution,
        text='',
        characters=string.printable)
    assert_dict_equal(d1={}, d2=observed)
def test_document_distribution_1_custom(self):
    """
    Test distribution over the custom character set '1', '2', '3'.
    """
    # Normalized distribution restricted to the three custom characters.
    observed = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution,
        DOCUMENT_EXAMPLE_1,
        characters=['1', '2', '3'])
    assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_CUSTOM, observed)
def test_document_distribution_1_digits(self):
    """
    Test distribution restricted to digit characters.
    """
    # Distribution over string.digits only.
    observed = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution,
        DOCUMENT_EXAMPLE_1,
        characters=string.digits)
    assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_DI, observed)
def test_document_distribution_1_print():
    """
    Test distribution over all printable characters.
    :return:
    """
    # Distribution over the full string.printable alphabet.
    observed = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution,
        DOCUMENT_EXAMPLE_1,
        characters=string.printable)
    assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_PRINT, observed)
def test_build_sentence_model():
    """
    Test that a custom-trained Punkt model treats known abbreviations
    (U.S., I.R.C.) as non-sentence-ending.
    :return:
    """
    # Train the segmenter on abbreviation-heavy text.
    training_text = "The I.R.C. is a large body of text produced by the U.S. Congress in D.C. every year."
    segmenter = lexnlp_tests.benchmark_extraction_func(build_sentence_model, training_text)
    # The query contains the same abbreviations and must stay one sentence.
    sentences = segmenter.tokenize("Have you ever cited the U.S. I.R.C. to your friends?")
    assert_equal(len(sentences), 1)
def test_page_examples():
    """
    Test get_pages against expected page lists, comparing with all
    whitespace removed so layout differences do not matter.
    """
    def remove_whitespace(r):
        # Strip every whitespace character so page comparison ignores layout.
        return r.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")

    # Fix: the helper was previously re-defined inside the loop on every
    # iteration; it is loop-invariant, so hoist it out.
    for (_i, text, _input_args, expected) in lexnlp_tests.iter_test_data_text_and_tuple():
        # Get list of pages
        page_list = list(lexnlp_tests.benchmark_extraction_func(get_pages, text))
        assert len(page_list) == len(expected)
        clean_result = [remove_whitespace(p) for p in expected]
        for page in page_list:
            assert remove_whitespace(page) in clean_result
def test_date_may():
    """
    Test that " may " alone does not parse as a date.
    :return:
    """
    sample = "this may be a date"
    # Neither strict nor non-strict mode should find a date here.
    found_nonstrict = lexnlp_tests.benchmark_extraction_func(
        get_dates_list, sample, strict=False, return_source=True)
    found_strict = get_dates_list(sample, strict=True, return_source=True)
    assert_equal(len(found_nonstrict), 0)
    assert_equal(len(found_strict), 0)
def test_document_distribution_1_lc(self):
    """
    Test distribution restricted to lowercase ASCII letters.
    """
    # Distribution over string.ascii_lowercase only.
    observed = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution,
        DOCUMENT_EXAMPLE_1,
        characters=string.ascii_lowercase)
    assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_LC, observed)
def test_error_case_2():
    """
    Regression test: get_amounts must not raise on this clause.
    :return:
    """
    text = """"Revolving Commitment Termination Date" shall mean the earliest of (i) May 11, 2021, (ii) the date on which the Revolving Commitments are terminated pursuant to Section 2.8 and (iii) the date on which all amounts outstanding under this Agreement have been declared or have automatically become due and payable (whether by acceleration or otherwise)."""
    # Exhaust the generator; success is simply the absence of an exception.
    for _amount in lexnlp_tests.benchmark_extraction_func(get_amounts, text):
        pass
def test_error_case_1():
    """
    Regression test: get_amounts must not raise on this clause.
    :return:
    """
    text = """55 "Term Loan Commitment" means, with respect to each Lender, the commitment, if any, of such Lender to make a Term Loan hereunder in the amount set forth on Annex I to this Agreement or on Schedule 1 to the Assignment and Assumption pursuant to which such Lender assumed its Term Loan Commitment, as applicable, as the same may be (a) increased from time to time pursuant to Section 2.19 and (b) reduced or increased from time to time pursuant to assignments by or to such Lender pursuant to Section 10.04."""
    # Exhaust the generator; success is simply the absence of an exception.
    for _amount in lexnlp_tests.benchmark_extraction_func(get_amounts, text):
        pass
def test_get_regulations_csv(self):
    """
    Test default get regulations behavior.
    :return:
    """
    # CSV fixture with one test case per line.
    test_data_path = os.path.join(
        lexnlp_test_path,
        'lexnlp/extract/en/tests/test_regulations/test_get_regulations.csv'
    )
    # Pass 1: shared test harness, comparing (type, code) tuples only
    # (source text dropped from the expected data).
    lexnlp_tests.test_extraction_func_on_test_data(
        get_regulations,
        expected_data_converter=lambda d: [
            (reg_type, reg_code) for reg_type, reg_code, _reg_str in d
        ],
        return_source=False,
        as_dict=False,
        test_data_path=test_data_path)
    # Pass 2: same harness, this time keeping the source regulation string.
    lexnlp_tests.test_extraction_func_on_test_data(
        get_regulations,
        expected_data_converter=lambda d: [(reg_type, reg_code, reg_str)
                                           for reg_type, reg_code, reg_str in d],
        return_source=True,
        as_dict=False,
        test_data_path=test_data_path)
    # Pass 3: dict-mode output compared row-by-row; errors are collected
    # per fixture line and reported together at the end.
    cmp = DictionaryComparer(check_order=True)
    errors = []
    for (i, text, _input_args, expected) in \
            lexnlp_tests.iter_test_data_text_and_tuple(file_name=test_data_path):
        # Convert expected tuples into the dict shape produced by as_dict=True.
        expected = [{
            'regulation_type': reg_type,
            'regulation_code': reg_code,
            'regulation_text': reg_str
        } for reg_type, reg_code, reg_str in expected]
        actual = list(
            lexnlp_tests.benchmark_extraction_func(get_regulations,
                                                   text,
                                                   return_source=True,
                                                   as_dict=True))
        line_errors = cmp.compare_list_of_dicts(expected, actual)
        if line_errors:
            line_errors_str = '\n'.join(line_errors)
            # i is zero-based; report one-based fixture line numbers.
            errors.append(f'Regulation tests, line [{i + 1}] errors:\n' + line_errors_str)
    if errors:
        raise Exception('\n\n'.join(errors))
def run_sentence_token_test(text, result, lowercase=False, stopword=False):
    """
    Base test method to run against text with given results.

    :param text: raw text to split into sentences and tokenize
    :param result: list of expected token lists, one per sentence
    :param lowercase: forwarded to get_token_list
    :param stopword: forwarded to get_token_list
    """
    # Get list from text
    sentence_list = get_sentence_list(text)
    # Check length first: fail fast if segmentation itself disagrees.
    assert len(sentence_list) == len(result)
    # Idiomatic pairwise iteration replaces the former
    # `for i in range(len(sentence_list))` index loop.
    for sentence, expected_tokens in zip(sentence_list, result):
        tokens = lexnlp_tests.benchmark_extraction_func(
            get_token_list, sentence, lowercase=lowercase, stopword=stopword)
        assert_list_equal(tokens, expected_tokens)
def test_get_citations_as_dict():
    """
    Test get_citations dict output for a single federal-reporter citation.
    """
    text = 'bob lissner v. test 1 F.2d 1, 2-5 (2d Cir., 1982)'
    expected = [{'citation_str': '1 F.2d 1, 2-5 (2d Cir., 1982)',
                 'court': '2d Cir.',
                 'page': 1,
                 'page2': '2-5',
                 'reporter': 'F.2d',
                 'reporter_full_name': 'Federal Reporter',
                 'volume': 1,
                 'year': 1982}]
    actual = list(lexnlp_tests.benchmark_extraction_func(
        get_citations, text, return_source=True, as_dict=True))
    assert_list_equal(actual, expected)
def test_page_examples(self):
    """
    Test get_pages against expected page lists from the CSV fixture,
    comparing with all whitespace removed so layout differences do not matter.
    """
    def remove_blankspace(r):
        # Strip every whitespace character so page comparison ignores layout.
        return r.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")

    # Fix: the helper was previously re-defined inside the loop on every
    # iteration; it is loop-invariant, so hoist it out.
    file_path = os.path.join(self.TEST_PATH, 'test_page_examples.csv')
    for (_i, text, _input_args, expected) in lexnlp_tests.iter_test_data_text_and_tuple(
            file_name=file_path):
        # Get list of pages
        page_list = list(
            lexnlp_tests.benchmark_extraction_func(get_pages, text))
        assert len(page_list) == len(expected)
        clean_result = [remove_blankspace(p) for p in expected]
        for page in page_list:
            assert remove_blankspace(page) in clean_result
def test_get_citations_as_dict():
    """
    Test get_citations dict output using DictionaryComparer for
    field-level error reporting.
    """
    text = 'bob lissner v. test 1 F.2d 1, 2-5 (2d Cir., 1982)'
    expected = [{'citation_str': '1 F.2d 1, 2-5 (2d Cir., 1982)',
                 'court': '2d Cir.',
                 'page': 1,
                 'page2': '2-5',
                 'reporter': 'F.2d',
                 'reporter_full_name': 'Federal Reporter',
                 'volume': 1,
                 'year': 1982}]
    actual = list(lexnlp_tests.benchmark_extraction_func(
        get_citations, text, return_source=True, as_dict=True))
    # Compare dict-by-dict so a failure names the offending fields.
    comparer = DictionaryComparer(check_order=True)
    problems = comparer.compare_list_of_dicts(expected, actual)
    if problems:
        raise Exception('Citations test has errors:\n' + '\n'.join(problems))
def run_paragraph_test(text, expected_paragraphs, window_pre=3, window_post=3):
    """
    Base test method to run against text with given results.

    :param text: raw text to split into paragraphs
    :param expected_paragraphs: expected paragraph strings
    :param window_pre: forwarded to get_paragraphs
    :param window_post: forwarded to get_paragraphs
    """
    def remove_whitespace(r: str):
        # Map newlines/tabs to spaces, then collapse runs of spaces to one.
        r = r.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        # Fix: the loop previously tested and replaced a single space with a
        # single space — a no-op whose condition never becomes false, i.e. an
        # infinite loop for any text containing a space. Collapsing requires
        # replacing double spaces with single ones until none remain.
        while '  ' in r:
            r = r.replace('  ', ' ')
        return r.strip()

    # Get list from text
    actual_paragraphs = list(
        lexnlp_tests.benchmark_extraction_func(get_paragraphs, text,
                                               window_pre=window_pre,
                                               window_post=window_post))
    actual_paragraphs = [remove_whitespace(p) for p in actual_paragraphs]
    expected_paragraphs = [remove_whitespace(p) for p in expected_paragraphs]
    assert_list_equal(actual_paragraphs, expected_paragraphs)
def run_paragraph_test(text, result, window_pre=3, window_post=3):
    """
    Base test method to run against text with given results.

    Compares paragraphs order-insensitively with all whitespace stripped.
    """
    def remove_whitespace(r):
        # Drop every whitespace character so comparison ignores layout.
        return r.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")

    # Split text into paragraphs.
    para_list = list(
        lexnlp_tests.benchmark_extraction_func(get_paragraphs, text,
                                               window_pre=window_pre,
                                               window_post=window_post))
    # Counts must match before membership checks.
    assert_equal(len(para_list), len(result))
    # Each produced paragraph must appear among the expected ones.
    clean_result = [remove_whitespace(expected_para) for expected_para in result]
    for produced_para in para_list:
        assert_in(remove_whitespace(produced_para), clean_result)
def test_date_feature_1_bigram():
    """
    Test date feature engineering with bigrams.
    :return:
    """
    # Feature vector for "2000-02-02" over digit characters, including
    # digit-bigram frequencies alongside single-character frequencies.
    date_feature = lexnlp_tests.benchmark_extraction_func(
        get_date_features, "2000-02-02", start_index=0, end_index=10,
        include_bigrams=True, characters=string.digits)
    # Expected: char_* entries are per-digit frequencies (8 digits total:
    # five '0's, three '2's -> 0.625 / 0.375); bigram_* entries cover every
    # ordered digit pair, with only "02" and "20" occurring in the input.
    assert_dict_equal(
        date_feature,
        {
            'bigram_02': 0.6666666666666666, 'bigram_06': 0.0, 'bigram_05': 0.0,
            'bigram_58': 0.0, 'bigram_41': 0.0, 'bigram_13': 0.0,
            'bigram_95': 0.0, 'bigram_37': 0.0, 'bigram_25': 0.0,
            'bigram_92': 0.0, 'bigram_20': 0.3333333333333333, 'bigram_71': 0.0,
            'bigram_29': 0.0, 'bigram_52': 0.0, 'bigram_67': 0.0,
            'bigram_96': 0.0, 'bigram_64': 0.0, 'char_5': 0.0,
            'bigram_27': 0.0, 'bigram_72': 0.0, 'bigram_80': 0.0,
            'bigram_86': 0.0, 'bigram_12': 0.0, 'bigram_23': 0.0,
            'bigram_38': 0.0, 'bigram_78': 0.0, 'bigram_14': 0.0,
            'bigram_32': 0.0, 'bigram_45': 0.0, 'bigram_03': 0.0,
            'bigram_83': 0.0, 'bigram_54': 0.0, 'char_1': 0.0,
            'bigram_28': 0.0, 'bigram_69': 0.0, 'bigram_35': 0.0,
            'bigram_85': 0.0, 'bigram_68': 0.0, 'bigram_51': 0.0,
            'bigram_26': 0.0, 'bigram_47': 0.0, 'bigram_46': 0.0,
            'char_2': 0.375, 'bigram_43': 0.0, 'bigram_48': 0.0,
            'bigram_90': 0.0, 'char_0': 0.625, 'bigram_50': 0.0,
            'bigram_56': 0.0, 'bigram_62': 0.0, 'char_4': 0.0,
            'bigram_34': 0.0, 'bigram_70': 0.0, 'bigram_73': 0.0,
            'bigram_15': 0.0, 'bigram_07': 0.0, 'bigram_30': 0.0,
            'bigram_63': 0.0, 'bigram_74': 0.0, 'bigram_36': 0.0,
            'bigram_19': 0.0, 'bigram_42': 0.0, 'bigram_53': 0.0,
            'bigram_89': 0.0, 'bigram_40': 0.0, 'bigram_87': 0.0,
            'bigram_01': 0.0, 'bigram_60': 0.0, 'bigram_76': 0.0,
            'bigram_18': 0.0, 'bigram_09': 0.0, 'bigram_16': 0.0,
            'bigram_24': 0.0, 'char_3': 0.0, 'bigram_10': 0.0,
            'bigram_17': 0.0, 'bigram_65': 0.0, 'bigram_31': 0.0,
            'bigram_93': 0.0, 'bigram_59': 0.0, 'bigram_91': 0.0,
            'bigram_61': 0.0, 'bigram_82': 0.0, 'char_8': 0.0,
            'char_9': 0.0, 'bigram_39': 0.0, 'bigram_49': 0.0,
            'bigram_81': 0.0, 'bigram_97': 0.0, 'bigram_75': 0.0,
            'bigram_84': 0.0, 'bigram_08': 0.0, 'bigram_98': 0.0,
            'bigram_79': 0.0, 'bigram_21': 0.0, 'bigram_04': 0.0,
            'char_7': 0.0, 'bigram_57': 0.0, 'char_6': 0.0,
            'bigram_94': 0.0
        })