def init_parser():
    """Install a German-tuned LineProcessor on CopyrightDeParser.

    Sentence breaks include the newline character; German abbreviations
    are matched case-insensitively so they do not terminate sentences.
    """
    params = LineSplitParams()
    params.line_breaks = {'\n', '.', ';', '!', '?'}
    params.abbreviations = DeLanguageTokens.abbreviations
    params.abbr_ignore_case = True
    CopyrightDeParser.line_processor = LineProcessor(line_split_params=params)
def test_de_linebreaks(self):
    """A bare newline must not split text when the newline char is
    absent from line_breaks: the two lines stay one sentence."""
    params = LineSplitParams()
    params.line_breaks = {'.', ';', '!', '?'}
    params.abbreviations = {'nr.', 'abs.', 'no.', 'act.', 'inc.', 'p.'}
    params.abbr_ignore_case = True
    processor = LineProcessor(line_split_params=params)
    text = 'Nach der Allgemeine\nGebührenverordnung'
    sentences = list(processor.split_text_on_line_with_endings(text))
    self.assertEqual(1, len(sentences))
def make_es_definitions_parser():
    """Build a UniversalDefinitionsParser configured for Spanish text.

    Returns:
        UniversalDefinitionsParser using Spanish abbreviations for
        sentence splitting and three Spanish definition matchers.
    """
    params = LineSplitParams()
    params.line_breaks = {'\n', '.', ';', '!', '?'}
    params.abbreviations = EsLanguageTokens.abbreviations
    params.abbr_ignore_case = True
    matchers = [
        CommonDefinitionPatterns.match_es_def_by_semicolon,
        SpanishParsingMethods.match_es_def_by_hereafter,
        SpanishParsingMethods.match_es_def_by_reffered,
    ]
    return UniversalDefinitionsParser(matchers, params)
def make_de_definitions_parser():
    """Build a UniversalDefinitionsParser configured for German text.

    Returns:
        UniversalDefinitionsParser using German abbreviations for sentence
        splitting, with German articles and conjunctions excluded from
        detected definition terms via ``prohibited_words``.
    """
    split_params = LineSplitParams()
    split_params.line_breaks = {'\n', '.', ';', '!', '?'}
    split_params.abbreviations = DeLanguageTokens.abbreviations
    split_params.abbr_ignore_case = True
    functions = [
        CommonDefinitionPatterns.match_es_def_by_semicolon,
        DeutscheParsingMethods.match_ist_jeder,
        DeutscheParsingMethods.match_im_sinne,
    ]
    parser = UniversalDefinitionsParser(functions, split_params)
    # set(...) replaces the redundant identity comprehension {w for w in ...}
    parser.prohibited_words = set(
        DeLanguageTokens.articles + DeLanguageTokens.conjunctions)
    return parser
def test_line_processor_phrases_abbr(self):
    """Registering 'nr.' as an abbreviation suppresses the sentence
    break after it, reducing the split count from 3 to 2."""
    text = 'Articolul saisprezece (16) nr. 2. Textul:'
    params = LineSplitParams()
    params.line_breaks = {'\n', '.', ';'}
    # without abbreviations every '.' terminates a sentence
    processor = LineProcessor(line_split_params=params)
    fragments = list(processor.split_text_on_line_with_endings(text))
    assert len(fragments) == 3
    # with 'nr.' known as an abbreviation its '.' no longer splits
    params.abbreviations = {'nr.', 'abs.'}
    params.abbr_ignore_case = True
    processor = LineProcessor(line_split_params=params)
    fragments = list(processor.split_text_on_line_with_endings(text))
    assert len(fragments) == 2
def test_de_abbrs(self):
    """Splitting behavior around the abbreviation 'a.D.'.

    'a.D.' in its registered casing keeps the text as one sentence;
    'A.d.' is expected to split into several.
    NOTE(review): the second assertion passes despite
    abbr_ignore_case=True — confirm intended matching semantics.
    """
    params = LineSplitParams()
    params.line_breaks = {'.', ';', '!', '?'}
    params.abbreviations = {'nr.', 'abs.', 'no.', 'act.', 'a.D.'}
    params.abbr_ignore_case = True
    processor = LineProcessor(line_split_params=params)
    sentences = list(
        processor.split_text_on_line_with_endings('1000 a.D. und drang'))
    self.assertEqual(1, len(sentences))
    sentences = list(
        processor.split_text_on_line_with_endings('1000 A.d. und drang'))
    self.assertGreater(len(sentences), 1)
def make_es_copyrights_parser():
    """Build a CopyrightParser configured for Spanish text.

    Returns:
        CopyrightParser using Spanish abbreviations for sentence
        splitting and two year/word copyright matchers, with Spanish
        articles and conjunctions excluded via ``prohibited_words``.
    """
    split_params = LineSplitParams()
    split_params.line_breaks = {'\n', '.', ';', '!', '?'}
    split_params.abbreviations = EsLanguageTokens.abbreviations
    split_params.abbr_ignore_case = True
    methods = SpanishCopyrightParsingMethods()
    functions = [methods.match_word_c_years, methods.match_c_years_word]
    parser = CopyrightParser(functions, split_params)
    # set(...) replaces the redundant identity comprehension {w for w in ...}
    parser.prohibited_words = set(
        EsLanguageTokens.articles + EsLanguageTokens.conjunctions)
    return parser
def __init__(self, gesetze_df: pd.DataFrame,
             verordnungen_df: pd.DataFrame,
             concept_df: pd.DataFrame):
    """Build dataframe-backed entity parsers for German laws (Gesetze),
    regulations (Verordnungen) and concepts.

    Args:
        gesetze_df: dataframe with 'Kurztitel', 'Titel', 'Abkürzung' columns.
        verordnungen_df: dataframe with the same law columns.
        concept_df: dataframe with columns 'a' (reference type) and
            'b' (normalized reference).
    """
    self.locale = ''

    # shared German sentence splitter for all three parsers
    split_params = LineSplitParams()
    split_params.line_breaks = {'.', ';', '!', '?'}
    split_params.abbreviations = DeLanguageTokens.abbreviations
    split_params.abbr_ignore_case = True
    processor = LineProcessor(line_split_params=split_params)

    law_columns = ('Kurztitel', 'Titel', 'Abkürzung')
    law_results = {'Titel': 'External Reference Normalized'}
    preformed_entity = {
        'External Reference Type': 'Laws and Rules',
        'External Reference Source': 'BaFin',
        'External Reference Issuing Country': 'Germany'
    }
    self.gesetze_parser = DataframeEntityParser(
        gesetze_df, law_columns,
        result_columns=law_results,
        preformed_entity=preformed_entity,
        line_processor=processor)
    self.verordnungen_parser = DataframeEntityParser(
        verordnungen_df, law_columns,
        result_columns=law_results,
        preformed_entity=preformed_entity,
        line_processor=processor)

    # the concept table supplies the reference type itself (column 'a'),
    # so drop the hard-coded type before reusing the preformed entity
    preformed_entity.pop('External Reference Type')
    self.concept_parser = DataframeEntityParser(
        concept_df, ('b', ),
        result_columns={'b': 'External Reference Normalized',
                        'a': 'External Reference Type'},
        preformed_entity=preformed_entity,
        line_processor=processor)