Example #1
0
def test_document_distribution_empty():
    """
    Ensure the distribution builder handles an empty document.
    :return:
    """
    # Benchmark extraction over the full printable character set;
    # the result itself is not inspected, only that the call succeeds.
    lexnlp_tests.benchmark_extraction_func(
        build_document_distribution, "", characters=string.printable)
def test_document_distribution_1_custom_nn():
    """
    Check a custom character set with normalization disabled.
    :return:
    """
    # Benchmark the extraction, then compare against the expected dictionary
    actual = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution,
        DOCUMENT_EXAMPLE_1,
        characters=['1', '2', '3'],
        norm=False)
    assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_CUSTOM_NO_NORM, actual)
Example #3
0
 def test_document_line_distribution_empty(self):
     """
     An empty document must yield an empty distribution over printables.
     """
     # Benchmark the extraction, then compare against an empty dict
     actual = lexnlp_tests.benchmark_extraction_func(
         func=build_document_distribution,
         text='',
         characters=string.printable)
     assert_dict_equal(d1={}, d2=actual)
Example #4
0
 def test_document_distribution_1_custom(self):
     """
     Distribution restricted to a custom character set.
     """
     # Benchmark the extraction, then compare dictionaries
     actual = lexnlp_tests.benchmark_extraction_func(
         build_document_distribution,
         DOCUMENT_EXAMPLE_1,
         characters=['1', '2', '3'])
     assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_CUSTOM, actual)
Example #5
0
 def test_document_distribution_1_digits(self):
     """
     Distribution restricted to decimal digits.
     """
     # Benchmark the extraction, then compare dictionaries
     actual = lexnlp_tests.benchmark_extraction_func(
         build_document_distribution,
         DOCUMENT_EXAMPLE_1,
         characters=string.digits)
     assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_DI, actual)
Example #6
0
def test_document_distribution_1_print():
    """
    Distribution over the full printable character set.
    :return:
    """
    # Benchmark the extraction, then compare dictionaries
    actual = lexnlp_tests.benchmark_extraction_func(
        build_document_distribution,
        DOCUMENT_EXAMPLE_1,
        characters=string.printable)
    assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_PRINT, actual)
Example #7
0
def test_build_sentence_model():
    """
    Verify a custom-trained Punkt model keeps abbreviation-heavy text together.
    :return:
    """
    # Train on a sample that is dense with period-bearing abbreviations
    training_text = "The I.R.C. is a large body of text produced by the U.S. Congress in D.C. every year."
    segmenter = lexnlp_tests.benchmark_extraction_func(build_sentence_model, training_text)

    # A sentence full of the trained abbreviations must not be split
    sentences = segmenter.tokenize("Have you ever cited the U.S. I.R.C. to your friends?")
    assert_equal(len(sentences), 1)
Example #8
0
def test_page_examples():
    """
    Check get_pages against every example in the shared test data.

    Pages are compared whitespace-insensitively so that page-break and
    layout differences do not cause false failures.
    """
    # Fix: the helper was previously redefined on every loop iteration;
    # define it once before the loop.
    def remove_whitespace(r):
        # Strip every kind of whitespace so only content is compared
        return r.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")

    for (_i, text, _input_args, expected) in lexnlp_tests.iter_test_data_text_and_tuple():
        # Get list of pages
        page_list = list(lexnlp_tests.benchmark_extraction_func(get_pages, text))
        assert len(page_list) == len(expected)
        clean_result = [remove_whitespace(p) for p in expected]
        for page in page_list:
            assert remove_whitespace(page) in clean_result
Example #9
0
def test_date_may():
    """
    Test that " may " alone does not parse as a date.
    :return:
    """
    # Ensure that no value is returned for either strict or non-strict mode.
    # Fix: the strict-mode call now goes through benchmark_extraction_func as
    # well, consistent with the non-strict call (it previously bypassed the
    # benchmarking wrapper).
    nonstrict_result = lexnlp_tests.benchmark_extraction_func(
        get_dates_list, "this may be a date", strict=False, return_source=True)
    strict_result = lexnlp_tests.benchmark_extraction_func(
        get_dates_list, "this may be a date", strict=True, return_source=True)
    assert_equal(len(nonstrict_result), 0)
    assert_equal(len(strict_result), 0)
Example #10
0
 def test_document_distribution_1_lc(self):
     """
     Distribution restricted to lowercase ASCII letters.
     """
     # Benchmark the extraction, then compare dictionaries
     actual = lexnlp_tests.benchmark_extraction_func(
         build_document_distribution,
         DOCUMENT_EXAMPLE_1,
         characters=string.ascii_lowercase)
     assert_dict_equal(DOCUMENT_EXAMPLE_1_RESULT_LC, actual)
Example #11
0
def test_error_case_2():
    """
    Regression test: this input previously raised inside amount extraction.
    :return:
    """
    text = """"Revolving Commitment Termination Date" shall mean the earliest of (i) May 11, 2021, (ii) the date on 
which the Revolving Commitments are terminated pursuant to Section 2.8 and (iii) the date on which all amounts 
outstanding under this Agreement have been declared or have automatically become due and payable (whether by 
acceleration or otherwise)."""
    # Exhaust the generator; extraction must complete without raising
    list(lexnlp_tests.benchmark_extraction_func(get_amounts, text))
Example #12
0
def test_error_case_1():
    """
    Regression test: this input previously raised inside amount extraction.
    :return:
    """

    text = """55	                        "Term Loan Commitment" means, with respect to each Lender, the commitment,
if any, of such Lender to make a Term Loan hereunder in the amount set forth on Annex I to this Agreement or on 
Schedule 1 to the Assignment and Assumption pursuant to which such Lender assumed its Term Loan Commitment, as 
applicable, as the same may be (a) increased from time to time pursuant to Section 2.19 and (b) reduced or increased 
from time to time pursuant to assignments by or to such Lender pursuant to Section 10.04."""

    # Exhaust the generator; extraction must complete without raising
    list(lexnlp_tests.benchmark_extraction_func(get_amounts, text))
Example #13
0
    def test_get_regulations_csv(self):
        """
        Exercise get_regulations against the shared CSV fixture, both with
        and without source text, then diff dict output row by row.
        :return:
        """
        test_data_path = os.path.join(
            lexnlp_test_path,
            'lexnlp/extract/en/tests/test_regulations/test_get_regulations.csv'
        )

        # Without source text: expected rows reduce to (type, code) pairs
        lexnlp_tests.test_extraction_func_on_test_data(
            get_regulations,
            expected_data_converter=lambda d: [
                (r_type, r_code) for r_type, r_code, _r_str in d
            ],
            return_source=False,
            as_dict=False,
            test_data_path=test_data_path)

        # With source text: expected rows keep all three fields
        lexnlp_tests.test_extraction_func_on_test_data(
            get_regulations,
            expected_data_converter=lambda d: [
                (r_type, r_code, r_str) for r_type, r_code, r_str in d
            ],
            return_source=True,
            as_dict=False,
            test_data_path=test_data_path)

        comparer = DictionaryComparer(check_order=True)
        all_errors = []

        # Dict-mode output: compare field-by-field per fixture line
        for (i, text, _input_args, expected) in \
                lexnlp_tests.iter_test_data_text_and_tuple(file_name=test_data_path):
            expected_dicts = [
                {
                    'regulation_type': r_type,
                    'regulation_code': r_code,
                    'regulation_text': r_str
                }
                for r_type, r_code, r_str in expected
            ]
            actual_dicts = list(
                lexnlp_tests.benchmark_extraction_func(get_regulations,
                                                       text,
                                                       return_source=True,
                                                       as_dict=True))

            row_errors = comparer.compare_list_of_dicts(expected_dicts, actual_dicts)
            if row_errors:
                all_errors.append(f'Regulation tests, line [{i + 1}] errors:\n' +
                                  '\n'.join(row_errors))

        if all_errors:
            raise Exception('\n\n'.join(all_errors))
Example #14
0
def run_sentence_token_test(text, result, lowercase=False, stopword=False):
    """
    Tokenize each sentence of *text* and compare against expected token lists.

    :param text: document to split into sentences and tokenize
    :param result: expected token lists, one per sentence
    :param lowercase: pass-through flag to get_token_list
    :param stopword: pass-through flag to get_token_list
    """
    # Get list from text
    sentence_list = get_sentence_list(text)

    # Check length first
    assert len(sentence_list) == len(result)

    # Check each sentence matches.
    # Fix: iterate pairwise with zip instead of the range(len(...)) index
    # anti-pattern; zip is safe because lengths were asserted equal above.
    for sentence, expected_tokens in zip(sentence_list, result):
        tokens = lexnlp_tests.benchmark_extraction_func(
            get_token_list, sentence, lowercase=lowercase, stopword=stopword)
        assert_list_equal(tokens, expected_tokens)
def test_get_citations_as_dict():
    """
    get_citations should return fully-populated dicts when as_dict=True.
    """
    text = 'bob lissner v. test 1 F.2d 1, 2-5 (2d Cir., 1982)'
    expected = [{
        'citation_str': '1 F.2d 1, 2-5 (2d Cir., 1982)',
        'court': '2d Cir.',
        'page': 1,
        'page2': '2-5',
        'reporter': 'F.2d',
        'reporter_full_name': 'Federal Reporter',
        'volume': 1,
        'year': 1982,
    }]
    actual = list(lexnlp_tests.benchmark_extraction_func(
        get_citations, text, return_source=True, as_dict=True))
    assert_list_equal(actual, expected)
Example #16
0
    def test_page_examples(self):
        """
        Check get_pages against every example in test_page_examples.csv.

        Pages are compared whitespace-insensitively so page-break and
        layout differences do not cause false failures.
        """
        file_path = os.path.join(self.TEST_PATH, 'test_page_examples.csv')

        # Fix: the helper was previously redefined on every loop iteration;
        # define it once before the loop.
        def remove_blankspace(r):
            # Strip every kind of whitespace so only content is compared
            return r.replace(" ", "").replace("\n", "").replace("\r", "").replace("\t", "")

        for (_i, text, _input_args,
             expected) in lexnlp_tests.iter_test_data_text_and_tuple(
                 file_name=file_path):
            # Get list of pages
            page_list = list(
                lexnlp_tests.benchmark_extraction_func(get_pages, text))
            assert len(page_list) == len(expected)
            clean_result = [remove_blankspace(p) for p in expected]
            for page in page_list:
                assert remove_blankspace(page) in clean_result
Example #17
0
def test_get_citations_as_dict():
    """
    Compare get_citations dict output against a known citation using
    DictionaryComparer for field-level error reporting.
    """
    text = 'bob lissner v. test 1 F.2d 1, 2-5 (2d Cir., 1982)'
    expected = [{
        'citation_str': '1 F.2d 1, 2-5 (2d Cir., 1982)',
        'court': '2d Cir.',
        'page': 1,
        'page2': '2-5',
        'reporter': 'F.2d',
        'reporter_full_name': 'Federal Reporter',
        'volume': 1,
        'year': 1982,
    }]
    actual = list(lexnlp_tests.benchmark_extraction_func(
        get_citations, text, return_source=True, as_dict=True))

    comparer = DictionaryComparer(check_order=True)
    problems = comparer.compare_list_of_dicts(expected, actual)
    if problems:
        raise Exception('Citations test has errors:\n' + '\n'.join(problems))
Example #18
0
def run_paragraph_test(text, expected_paragraphs, window_pre=3, window_post=3):
    """
    Split *text* into paragraphs and assert they match the expected list,
    comparing whitespace-insensitively.
    """
    def squash_whitespace(value: str):
        # Turn every newline/carriage-return/tab into a space, collapse
        # runs of spaces, then trim the ends.
        for ch in ('\n', '\r', '\t'):
            value = value.replace(ch, ' ')
        while '  ' in value:
            value = value.replace('  ', ' ')
        return value.strip()

    # Extract paragraphs from the text
    extracted = list(
        lexnlp_tests.benchmark_extraction_func(get_paragraphs,
                                               text,
                                               window_pre=window_pre,
                                               window_post=window_post))

    # Normalize both sides before the ordered comparison
    assert_list_equal([squash_whitespace(p) for p in extracted],
                      [squash_whitespace(p) for p in expected_paragraphs])
Example #19
0
def run_paragraph_test(text, result, window_pre=3, window_post=3):
    """
    Assert that every extracted paragraph appears in *result*,
    ignoring all whitespace.
    """
    def strip_whitespace(value):
        # Delete spaces, newlines, carriage returns and tabs entirely
        for ch in (' ', '\n', '\r', '\t'):
            value = value.replace(ch, '')
        return value

    # Extract paragraphs from the text
    paragraphs = list(
        lexnlp_tests.benchmark_extraction_func(get_paragraphs,
                                               text,
                                               window_pre=window_pre,
                                               window_post=window_post))

    # Counts must match before membership checks
    assert_equal(len(paragraphs), len(result))

    # Each normalized paragraph must appear among the normalized expectations
    normalized_expected = [strip_whitespace(item) for item in result]
    for paragraph in paragraphs:
        assert_in(strip_whitespace(paragraph), normalized_expected)
Example #20
0
def test_date_feature_1_bigram():
    """
    Test date feature engineering with bigrams.

    Builds features for the string "2000-02-02" over the digit alphabet and
    compares against the full expected feature dictionary: char_N keys are
    single-digit frequencies, bigram_NM keys are digit-pair frequencies.
    :return:
    """
    # Feature extraction over the whole string (indices 0..10), digits only,
    # with bigram features enabled.
    date_feature = lexnlp_tests.benchmark_extraction_func(
        get_date_features,
        "2000-02-02",
        start_index=0,
        end_index=10,
        include_bigrams=True,
        characters=string.digits)
    # Only bigram_02, bigram_20, char_0 and char_2 are non-zero: the digit
    # sequence "20000202" contains just those digits and adjacent pairs.
    assert_dict_equal(
        date_feature, {
            'bigram_02': 0.6666666666666666,
            'bigram_06': 0.0,
            'bigram_05': 0.0,
            'bigram_58': 0.0,
            'bigram_41': 0.0,
            'bigram_13': 0.0,
            'bigram_95': 0.0,
            'bigram_37': 0.0,
            'bigram_25': 0.0,
            'bigram_92': 0.0,
            'bigram_20': 0.3333333333333333,
            'bigram_71': 0.0,
            'bigram_29': 0.0,
            'bigram_52': 0.0,
            'bigram_67': 0.0,
            'bigram_96': 0.0,
            'bigram_64': 0.0,
            'char_5': 0.0,
            'bigram_27': 0.0,
            'bigram_72': 0.0,
            'bigram_80': 0.0,
            'bigram_86': 0.0,
            'bigram_12': 0.0,
            'bigram_23': 0.0,
            'bigram_38': 0.0,
            'bigram_78': 0.0,
            'bigram_14': 0.0,
            'bigram_32': 0.0,
            'bigram_45': 0.0,
            'bigram_03': 0.0,
            'bigram_83': 0.0,
            'bigram_54': 0.0,
            'char_1': 0.0,
            'bigram_28': 0.0,
            'bigram_69': 0.0,
            'bigram_35': 0.0,
            'bigram_85': 0.0,
            'bigram_68': 0.0,
            'bigram_51': 0.0,
            'bigram_26': 0.0,
            'bigram_47': 0.0,
            'bigram_46': 0.0,
            'char_2': 0.375,
            'bigram_43': 0.0,
            'bigram_48': 0.0,
            'bigram_90': 0.0,
            'char_0': 0.625,
            'bigram_50': 0.0,
            'bigram_56': 0.0,
            'bigram_62': 0.0,
            'char_4': 0.0,
            'bigram_34': 0.0,
            'bigram_70': 0.0,
            'bigram_73': 0.0,
            'bigram_15': 0.0,
            'bigram_07': 0.0,
            'bigram_30': 0.0,
            'bigram_63': 0.0,
            'bigram_74': 0.0,
            'bigram_36': 0.0,
            'bigram_19': 0.0,
            'bigram_42': 0.0,
            'bigram_53': 0.0,
            'bigram_89': 0.0,
            'bigram_40': 0.0,
            'bigram_87': 0.0,
            'bigram_01': 0.0,
            'bigram_60': 0.0,
            'bigram_76': 0.0,
            'bigram_18': 0.0,
            'bigram_09': 0.0,
            'bigram_16': 0.0,
            'bigram_24': 0.0,
            'char_3': 0.0,
            'bigram_10': 0.0,
            'bigram_17': 0.0,
            'bigram_65': 0.0,
            'bigram_31': 0.0,
            'bigram_93': 0.0,
            'bigram_59': 0.0,
            'bigram_91': 0.0,
            'bigram_61': 0.0,
            'bigram_82': 0.0,
            'char_8': 0.0,
            'char_9': 0.0,
            'bigram_39': 0.0,
            'bigram_49': 0.0,
            'bigram_81': 0.0,
            'bigram_97': 0.0,
            'bigram_75': 0.0,
            'bigram_84': 0.0,
            'bigram_08': 0.0,
            'bigram_98': 0.0,
            'bigram_79': 0.0,
            'bigram_21': 0.0,
            'bigram_04': 0.0,
            'char_7': 0.0,
            'bigram_57': 0.0,
            'char_6': 0.0,
            'bigram_94': 0.0
        })