def test_line_processor_lines(self):
    """A three-line text is split into exactly three lines."""
    sample = """
aaa
Bb b
c"""
    processor = LineProcessor()
    produced = list(processor.split_text_on_line_with_endings(sample))
    assert len(produced) == 3
def test_line_processor_phrases_de(self):
    """A German legal paragraph splits on '.', ';' and newline into 3 pieces."""
    sample = """
(2) Vermögenswerte im Sinne dieses Gesetzes sind bebaute und unbebaute Grundstücke sowie rechtlich selbständige Gebäude und Baulichkeiten (im folgenden Grundstücke und Gebäude genannt), Nutzungsrechte und dingliche Rechte an Grundstücken oder Gebäuden, bewegliche Sachen sowie gewerbliche Schutzrechte, Urheberrechte und verwandte Schutzrechte. Vermögenswerte im Sinne dieses Gesetzes sind auch Kontoguthaben und sonstige auf Geldzahlungen gerichtete Forderungen sowie Eigentum/Beteiligungen an Unternehmen oder an Betriebsstätten/Zweigniederlassungen von Unternehmen mit Sitz außerhalb der Deutschen Demokratischen Republik.
"""
    params = LineSplitParams()
    params.line_breaks = {'\n', '.', ';'}
    processor = LineProcessor(line_split_params=params)
    phrases = list(processor.split_text_on_line_with_endings(sample))
    assert len(phrases) == 3  # plus one for an empty line
def test_split_text_on_words(self):
    """Tokenizing mixed text yields 7 words and 8 separator tokens."""
    sample = ' While I pounded, weak and weary. Over '
    tokens = LineProcessor().split_text_on_words(sample)
    assert len([t for t in tokens if t.is_separator]) == 8
    assert len([t for t in tokens if not t.is_separator]) == 7
def __init__(self,
             parsing_functions: List[Callable[[str], List[DefinitionMatch]]],
             split_params: LineSplitParams):
    """
    :param parsing_functions: a functions' collection from SpanishParsingMethods
    :param split_params: text-to-sentences splitting params
    """
    self.annotations = []  # type: List[dict]
    self.parsing_functions = parsing_functions
    self.split_params = split_params
    # words that are Not definitions per se
    self.prohibited_words = {}
    self.proc = LineProcessor()
def test_de_linebreaks(self):
    """A bare newline is not a sentence break under these split params."""
    params = LineSplitParams()
    params.line_breaks = {'.', ';', '!', '?'}
    params.abbreviations = {'nr.', 'abs.', 'no.', 'act.', 'inc.', 'p.'}
    params.abbr_ignore_case = True
    sample = 'Nach der Allgemeine\nGebührenverordnung'
    processor = LineProcessor(line_split_params=params)
    parts = list(processor.split_text_on_line_with_endings(sample))
    self.assertEqual(1, len(parts))
def test_line_processor_phrases(self):
    """The poem splits into 6 phrases on newline, '.' and ';'."""
    poem = """
Once upon a midnight dreary While I pounded, weak and weary. Over many a quaint and curious volume of forgotten lore, While I nodded, nearly napping; suddenly there came a tapping, As of some one gently rapping, rapping at my chamber door."""
    params = LineSplitParams()
    params.line_breaks = {'\n', '.', ';'}
    processor = LineProcessor(line_split_params=params)
    phrases = list(processor.split_text_on_line_with_endings(poem))
    assert len(phrases) == 6
def split_text_on_lines(self, text: str):
    """Split *text* into typed lines and seed a fresh quality estimate."""
    self.estimate = ParsedTextQualityEstimate()
    processor = LineProcessor()
    wrapped = []
    for raw_line in processor.split_text_on_line_with_endings(text):
        wrapped.append(TypedLineOrPhrase.wrap_line(raw_line))
    self.lines = wrapped
    # average line length feeds the header-probability heuristic
    processor.determine_line_length(text)
    self.estimate.avg_line_length = processor.line_length
    for typed_line in self.lines:
        self.determine_line_type(typed_line)
def test_de_abbrs(self):
    """'a.D.' is honoured as an abbreviation; a re-cased variant is not."""
    params = LineSplitParams()
    params.line_breaks = {'.', ';', '!', '?'}
    params.abbreviations = {'nr.', 'abs.', 'no.', 'act.', 'a.D.'}
    params.abbr_ignore_case = True
    processor = LineProcessor(line_split_params=params)
    parts = list(processor.split_text_on_line_with_endings('1000 a.D. und drang'))
    self.assertEqual(1, len(parts))
    parts = list(processor.split_text_on_line_with_endings('1000 A.d. und drang'))
    self.assertGreater(len(parts), 1)
def init_parser():
    """Configure the class-level line processor for German copyright parsing."""
    params = LineSplitParams()
    params.line_breaks = {'\n', '.', ';', '!', '?'}
    params.abbreviations = DeLanguageTokens.abbreviations
    params.abbr_ignore_case = True
    CopyrightDeParser.line_processor = LineProcessor(line_split_params=params)
def test_line_processor_phrases_abbr(self):
    """Registering 'nr.' as an abbreviation suppresses the break after 'Nr.'."""
    text = 'Articolul saisprezece (16) Nr. 2. Textul:'
    proc = LineProcessor()
    ptrs = LineSplitParams()
    ptrs.line_breaks = {'\n', '.', ';'}
    lines = list(proc.split_text_on_line_with_endings(text, ptrs))
    assert len(lines) == 3
    # a set, not a list: consistent with line_breaks above and with the
    # sibling test that assigns ptrs.abbreviations = {'nr.', 'abs.'}
    ptrs.abbreviations = {'nr.', 'abs.'}
    ptrs.abbr_ignore_case = True
    lines = list(proc.split_text_on_line_with_endings(text, ptrs))
    assert len(lines) == 2
def __init__(self, gesetze_df: pd.DataFrame, verordnungen_df: pd.DataFrame,
             concept_df: pd.DataFrame):
    """Build dataframe-backed parsers for German laws, regulations and concepts."""
    self.locale = ''
    # one shared phrase splitter for all three parsers
    split_params = LineSplitParams()
    split_params.line_breaks = {'.', ';', '!', '?'}
    split_params.abbreviations = DeLanguageTokens.abbreviations
    split_params.abbr_ignore_case = True
    processor = LineProcessor(line_split_params=split_params)

    law_columns = ('Kurztitel', 'Titel', 'Abkürzung')
    law_results = {'Titel': 'External Reference Normalized'}
    preformed_entity = {
        'External Reference Type': 'Laws and Rules',
        'External Reference Source': 'BaFin',
        'External Reference Issuing Country': 'Germany'
    }
    self.gesetze_parser = DataframeEntityParser(
        gesetze_df, law_columns, result_columns=law_results,
        preformed_entity=preformed_entity, line_processor=processor)
    self.verordnungen_parser = DataframeEntityParser(
        verordnungen_df, law_columns, result_columns=law_results,
        preformed_entity=preformed_entity, line_processor=processor)

    # concepts derive the reference type from column 'a' instead of a fixed value
    preformed_entity.pop('External Reference Type')
    self.concept_parser = DataframeEntityParser(
        concept_df, ('b', ),
        result_columns={'b': 'External Reference Normalized',
                        'a': 'External Reference Type'},
        preformed_entity=preformed_entity, line_processor=processor)
def test_check_phrase_starts_with_phrase(self):
    """check_phrase_starts_with_phrase matches single words and word chains."""
    sample = 'While I pounded, weak and weary. Over many a quaint and curious volume of forgotten lore'
    proc = LineProcessor()
    tokens = proc.split_text_on_words(sample)
    # single-word alternatives
    assert proc.check_phrase_starts_with_phrase(tokens, 2, ['I', 'goat'])
    assert not proc.check_phrase_starts_with_phrase(tokens, 3, ['I', 'goat'])
    assert proc.check_phrase_starts_with_phrase(tokens, 6, ['I', 'weak'])
    # multi-word alternative: consecutive words must all match
    assert proc.check_phrase_starts_with_phrase(tokens, 6, ['I', ['weak', 'and']])
    assert not proc.check_phrase_starts_with_phrase(tokens, 6, ['I', ['weak', 'weary']])
def test_line_processor_phrases_abbr(self):
    """With 'nr.' registered as an abbreviation, one sentence break disappears."""
    sample = 'Articolul saisprezece (16) nr. 2. Textul:'
    params = LineSplitParams()
    params.line_breaks = {'\n', '.', ';'}
    splitter = LineProcessor(line_split_params=params)
    assert len(list(splitter.split_text_on_line_with_endings(sample))) == 3
    params.abbreviations = {'nr.', 'abs.'}
    params.abbr_ignore_case = True
    splitter = LineProcessor(line_split_params=params)
    assert len(list(splitter.split_text_on_line_with_endings(sample))) == 2
class ParsedTextQualityEstimator:
    """ Estimates the probability of the text passed being somewhat corrupted """
    # characters that normally terminate a sentence; a line ending with one of
    # these is assumed NOT to be a header
    sentence_break_chars = {'.', ';', '!', '?', ','}
    # matches "(a) ", "1. ", "2) " style numbered/lettered headers at line start
    reg_numered_header = re.compile(
        r'(^[\s]*\(?[a-zA-Z]\)?\s)|(^[\s]*[0-9\.]+[\)]?\s)')
    # a paragraph start is signalled by two leading spaces or a tab
    reg_paragraph_start = re.compile(r'(^\s{2})|(^\t)')
    # lines longer than this are treated as whole paragraphs, never as lines
    # that were broken by accident
    minimal_paragraph_line_length = 250

    def __init__(self):
        # current estimate; rebuilt by split_text_on_lines()
        self.estimate = ParsedTextQualityEstimate()
        # typed lines produced by the last split_text_on_lines() call
        self.lines = []
        self.proc = LineProcessor()

    def estimate_text(self, text: str) -> ParsedTextQualityEstimate:
        """Estimate how likely *text* is corrupted by unnecessary line breaks.

        :param text: a text containing a number of \n\n sequences
        :return: ParsedTextQualityEstimate, e.g.
                 {'avg_line_length': 103,
                  'extra_line_breaks_prob': 66,
                  'corrupted_prob': 66}
        """
        self.split_text_on_lines(text)
        # does the text contain unnecessary line breaks?
        self.estimate_extra_line_breaks()
        # wrap up the estimate: extra line breaks are currently the only
        # corruption signal
        self.estimate.corrupted_prob = self.estimate.extra_line_breaks_prob
        return self.estimate

    def split_text_on_lines(self, text: str):
        """Split *text* into typed lines and record the average line length."""
        self.estimate = ParsedTextQualityEstimate()
        self.lines = [
            TypedLineOrPhrase.wrap_line(l)
            for l in self.proc.split_text_on_line_with_endings(text)
        ]
        self.proc.determine_line_length(text)
        self.estimate.avg_line_length = self.proc.line_length
        for line in self.lines:
            self.determine_line_type(line)

    def estimate_extra_line_breaks(self):
        """Derive extra_line_breaks_prob from runs of suspicious line endings."""
        lines_total = len(self.lines)
        if lines_total == 0:
            return
        longest_seq = 0
        current_seq = 0
        total_extra_breaks = 0
        for indx in range(0, len(self.lines)):
            if self.check_line_followed_by_unnecessary_break(indx):
                total_extra_breaks += 1
                current_seq += 1
                longest_seq = max(current_seq, longest_seq)
                continue
            current_seq = 0
        if total_extra_breaks > 1:
            # p1 scores the longest run of suspicious breaks, p2 their share;
            # the final probability is whichever signal is stronger, capped at 100
            p1 = 100 if longest_seq > lines_total / 3 else int(
                100 * longest_seq * 2.5 / lines_total)
            p2 = int(100 * total_extra_breaks * 2 / lines_total)
            self.estimate.extra_line_breaks_prob = min(100, max(p1, p2))

    def check_line_followed_by_unnecessary_break(self,
                                                 line_index: int) -> bool:
        """Return True when the blank line after *line_index* looks accidental."""
        line = self.lines[line_index]
        # a single newline is a normal line ending, not a paragraph gap
        if line.ending.count('\n') <= 1:
            return False
        if len(line.text
               ) > ParsedTextQualityEstimator.minimal_paragraph_line_length:
            # the whole line could be a paragraph
            return False
        # a gap is legitimate after a header or before a header/paragraph start
        prob_needs_extra = line.type == LineType.header
        if not prob_needs_extra:
            next_line = self.lines[
                line_index + 1] if line_index < len(self.lines) - 1 else None
            prob_needs_extra = next_line is not None and next_line.type != LineType.regular
        return not prob_needs_extra

    def determine_line_type(self, line: TypedLineOrPhrase):
        """Tag *line* as header or paragraph_start when the heuristics fire."""
        p_head = self.estimate_line_is_header_prob(line.text)
        if p_head > 50:
            line.type = LineType.header
            return
        p_par_start = self.estimate_line_is_paragraph_start_prob(line.text)
        if p_par_start > 50:
            line.type = LineType.paragraph_start

    def estimate_line_is_paragraph_start_prob(self, line: str) -> int:
        """Return 100 when the line is indented like a paragraph start, else 0."""
        if ParsedTextQualityEstimator.reg_paragraph_start.search(line):
            return 100
        return 0

    def estimate_line_is_header_prob(self, line: str) -> int:
        """Return the probability (0..100) that *line* is a header."""
        line = line.rstrip(' \t')
        if len(line) == 0:
            return 0
        # headers do not end with sentence punctuation
        if line[-1] in ParsedTextQualityEstimator.sentence_break_chars:
            return 0
        if ParsedTextQualityEstimator.reg_numered_header.search(line):
            return 100
        # noticeably shorter than the average line: probably a header
        if len(line) < self.estimate.avg_line_length * 0.6:
            return 65  # 65% chance the line is a header
        return 35
def __init__(self):
    # fresh estimate, no lines parsed yet
    self.proc = LineProcessor()
    self.lines = []
    self.estimate = ParsedTextQualityEstimate()
class UniversalDefinitionsParser:
    """
    EsDefinitionsParser searches for definitions in text according
    to the rules of Spanish. See the "parse" method
    """

    def __init__(self,
                 parsing_functions: List[Callable[[str], List[DefinitionMatch]]],
                 split_params: LineSplitParams):
        """
        :param parsing_functions: a functions' collection from SpanishParsingMethods
        :param split_params: text-to-sentences splitting params
        """
        self.parsing_functions = parsing_functions
        self.annotations = []  # type: List[dict]
        self.split_params = split_params
        self.proc = LineProcessor()
        self.prohibited_words = {}  # words that are Not definitions per se

    def parse(self, text: str) -> List[dict]:
        """Find definition annotations in *text*, phrase by phrase.

        :param text: En este acuerdo, el término "Software" se refiere a:
                     (i) el programa informático
        :return: [{"attrs": {"start": 28, "end": 82},
                   "tags": {"Extracted Entity Type": "definition",
                            "Extracted Entity Definition Name": "Software",
                            "Extracted Entity Text": "..."}}, ...]
        """
        for phrase in self.proc.split_text_on_line_with_endings(text,
                                                                self.split_params):
            # collect raw matches from all parsing functions
            matches = []
            for f in self.parsing_functions:
                matches += f(phrase.text)
            # drop stop-words, then keep only the best non-overlapping matches
            matches = self.remove_prohibited_words(matches)
            if len(matches) > 1:
                matches = self.choose_best_matches(matches)
                matches = self.choose_more_precise_matches(matches)
            for match in matches:
                # phrase-relative offsets -> text-relative offsets
                ant = {
                    "attrs": {
                        "start": phrase.start + match.start,
                        "end": phrase.start + match.end
                    },
                    "tags": {
                        'Extracted Entity Type': 'definition',
                        'Extracted Entity Definition Name': match.name,
                        'Extracted Entity Text':
                            phrase.text[match.start: match.end]
                    }
                }
                self.annotations.append(ant)
        return self.annotations

    def remove_prohibited_words(self,
                                matches: List[DefinitionMatch]) -> List[DefinitionMatch]:
        # like 'und' or 'and' or 'the' - the word like this is not a definition itself
        return [m for m in matches if m.name not in self.prohibited_words]

    def choose_best_matches(self,
                            matches: List[DefinitionMatch]) -> List[DefinitionMatch]:
        """Keep a single, highest-quality match per (stripped) name.

        Groups explicitly via a dict instead of itertools.groupby: groupby
        merges only *adjacent* items, so duplicates that were not neighbours
        in the unsorted list were never deduplicated. First-seen group order
        is preserved (dicts keep insertion order).
        """
        grouped = {}
        for m in matches:
            grouped.setdefault(m.name.strip(" \t'\""), []).append(m)
        resulted = []
        for same_matches in grouped.values():
            if len(same_matches) > 1:
                # max() returns the first of equally-good matches, matching the
                # stable sorted(..., reverse=True)[0] behavior
                same_matches = [max(same_matches,
                                    key=UniversalDefinitionsParser.estimate_match_quality)]
            resulted += same_matches
        return resulted

    def choose_more_precise_matches(self,
                                    matches: List[DefinitionMatch]) -> List[DefinitionMatch]:
        """
        look for a match "consumed" by other matches
        and spare the consuming! matches
        """
        resulted = []
        if len(matches) < 2:
            return matches
        for i in range(0, len(matches)):
            a = matches[i]
            is_consuming = False
            for j in range(0, len(matches)):
                if i == j:
                    continue
                b = matches[j]
                # a's name contains b's name -> a "consumes" b and is dropped
                if b.name in a.name:
                    is_consuming = True
                    break
            if not is_consuming:
                resulted.append(a)
        return resulted

    @staticmethod
    def estimate_match_quality(match: DefinitionMatch) -> int:
        # higher probability wins; shorter spans win among equal probabilities
        return 1000 * match.probability - (match.end - match.start)
class UniversalCourtsParser:
    """
    The class describes a "constructor" for building locale (and region)
    specific parsers, that find references to courts within the text.

    Use the parse() method to find all references to courts in the text
    provided. Each reference is a dictionary with two keys:
    - "attrs" leads to the "coordinates" (start / end characters) of the
      occurrence within the provided text
    - "tags" leads to another dictionary with the court's official name,
      type and jurisdiction

    Create your locale (or region) specific instance via the constructor below.
    """

    def __init__(self, ptrs: ParserInitParams):
        """
        :param ptrs.court_pattern_checker: a regex or None; the parser skips a
            phrase (or the whole text) if the pattern doesn't match it, e.g.
            re.compile('court', re.IGNORECASE) for the En locale
        :param ptrs.column_names['type']: "Court Type", e.g. 'Federal District Court'
        :param ptrs.column_names['name']: "Court Name", e.g. 'Southern Georgia District Court'
        :param ptrs.column_names['jurisdiction']: "Jurisdiction", e.g. 'Federal'
        :param ptrs.column_names['alias']: "Alias", e.g. 'C.D. Cal'
        :param ptrs.dataframe_paths: like ['data/us_courts.csv', ...] - CSV
            files where the 'Court Name' column uniquely identifies each court
            while 'Court Type' + 'Jurisdiction' may or may not
        :param ptrs.split_ptrs: phrase splitting params, see LineProcessor;
            phrase bounds usually include punctuation (.,;!?) and conjunctions
        :param ptrs.key_word_preproc_func: a function used to pre-process
            column values before text search, e.g.
            lambda text: re.sub('e$', '[e]?', text)
        """
        # NB: the pattern must carry its own flags (e.g. re.IGNORECASE) -
        # compiled patterns cannot take flags at search time
        self.phrase_match_pattern = ptrs.court_pattern_checker
        self.court_type_column = ptrs.column_names['type']
        self.court_name_column = ptrs.column_names['name']
        self.court_alias_column = ptrs.column_names['alias']
        self.jurisdiction_column = ptrs.column_names['jurisdiction']
        self.proc = LineProcessor()
        self.phrase_split_ptrs = ptrs.split_ptrs
        self.annotations = []
        self.courts = None
        self.load_courts(ptrs.dataframe_paths)
        preproc = ptrs.key_word_preproc_func
        unique = UniversalCourtsParser.get_unique_col_values
        # unique columns
        self.finder_court_name = PhraseFinder(
            unique(self.courts[self.court_name_column]), preproc)
        self.finder_court_alias = None if len(self.court_alias_column) == 0 else \
            PhraseFinder(unique(self.courts[self.court_alias_column]), preproc)
        # non-unique columns
        self.finder_court_type = PhraseFinder(
            unique(self.courts[self.court_type_column]), preproc)
        self.finder_jur = PhraseFinder(
            unique(self.courts[self.jurisdiction_column]), preproc)

    def parse(self, text: str) -> List[dict]:
        """Find court references in *text*.

        Example:
        ret = processor.parse("Bei dir läuft, deine Verfassungsgerichtshof des Freistaates Sachsen rauchen Joints vor der Kamera")
        ret[0]['attrs'] = {'start': 14, 'end': 97}
        ret[0]['tags'] = {'Extracted Entity Type': 'court',
            'Extracted Entity Court Name': 'Verfassungsgerichtshof des Freistaates Sachsen',
            'Extracted Entity Court Type': 'Verfassungsgericht',
            'Extracted Entity Court Jurisdiction': 'Sachsen'}
        """
        self.annotations = []
        self.find_courts_by_alias_in_whole_text(text)
        # if the whole text doesn't contain the key word (gericht) - skip all
        # the following. BUGFIX: re.IGNORECASE must NOT be passed to
        # Pattern.search() - its second positional argument is the start
        # *position*, so the old call silently began searching at index 2
        if self.phrase_match_pattern is not None:
            if self.phrase_match_pattern.search(text) is None:
                return self.annotations
        for phrase in self.proc.split_text_on_line_with_endings(
                text, self.phrase_split_ptrs):
            # if the phrase doesn't contain the key word (e.g., gericht for
            # deutsche) - skip the phrase
            if self.phrase_match_pattern is not None:
                if self.phrase_match_pattern.search(phrase.text) is None:
                    continue
            self.find_court_by_any_key(phrase)
        return self.annotations

    def load_courts(self, dataframe_paths: List[str]):
        """Read and concatenate the court CSV files into self.courts."""
        dtypes = {
            self.court_type_column: str,
            self.court_name_column: str,
            self.jurisdiction_column: str
        }
        if self.court_alias_column:
            dtypes[self.court_alias_column] = str
        frames = []
        for path in dataframe_paths:
            # NOTE(review): error_bad_lines is deprecated in recent pandas
            # (replaced by on_bad_lines) - kept for compatibility; confirm the
            # project's pandas version before changing
            frame = pd.read_csv(path,
                                encoding="utf-8",
                                error_bad_lines=False,
                                converters=dtypes)
            frames.append(frame)
        self.courts = pd.concat(frames)

    def find_courts_by_alias_in_whole_text(self, text: str) -> None:
        """Annotate every alias occurrence (aliases are searched text-wide)."""
        if self.finder_court_alias is None:
            return
        for m in self.finder_court_alias.find_word(text):
            alias = m[0]
            rows = self.courts.loc[self.courts[self.court_alias_column] == alias]
            match_found = MatchFound(rows, m[1], m[2])
            self.add_annotation(match_found)

    def find_court_by_any_key(self, phrase: LineOrPhrase):
        """Annotate the best court match (by name or type+jurisdiction) in *phrase*."""
        matches = []
        matches += self.find_court_by_name(phrase)
        matches += self.find_court_by_type_and_jurisdiction(phrase)
        matches = [m for m in matches if m is not None]
        if len(matches) == 0:
            return
        # find the best match
        matches.sort(key=lambda m: m.make_sort_key())
        self.add_annotation(matches[0])

    def find_court_by_name(self, phrase: LineOrPhrase) -> List[MatchFound]:
        """Match the phrase against the unique court-name column."""
        match = self.find_court_by_key_column(phrase, self.finder_court_name,
                                              self.court_name_column)
        if match is None:
            return []
        match[0].court_name = match[1][0][0]
        return [match[0]]

    def find_court_by_key_column(
            self, phrase: LineOrPhrase, phrase_finder: PhraseFinder,
            column: str) -> Tuple[MatchFound, List[PhraseMatch]]:
        """Return (match, found substrings) for *column*, or None when absent."""
        found_substrings = phrase_finder.find_word(phrase.text, True)
        if len(found_substrings) == 0:
            return None
        subset = self.courts.loc[self.courts[column] == found_substrings[0][0]]
        if len(subset) == 0:
            return None
        match = MatchFound(subset, phrase.start + found_substrings[0][1],
                           phrase.start + found_substrings[0][2])
        return (match, found_substrings)

    def find_court_by_type_and_jurisdiction(
            self, phrase: LineOrPhrase) -> List[MatchFound]:
        """Match by the (non-unique) type column, narrowed by jurisdiction."""
        court_types = self.finder_court_type.find_word(phrase.text, True)
        if len(court_types) == 0:
            return []
        court_jurs = self.finder_jur.find_word(phrase.text, True)
        if len(court_types) != 1 or len(court_jurs) > 1:
            # special case: 2 ore more courts within the same phrase
            # (without commas or conjuctions)
            matches = []
            for ct in court_types:
                m = MatchFound([], phrase.start + ct[1], phrase.start + ct[2])
                m.court_type = ct[0]
                m.court_name = ct[0]
                matches.append(m)
            return matches
        if len(court_jurs) == 0:
            subset = self.courts.loc[self.courts[self.court_type_column] ==
                                     court_types[0][0]]
        else:
            subset = self.courts.loc[
                (self.courts[self.court_type_column] == court_types[0][0])
                & (self.courts[self.jurisdiction_column] == court_jurs[0][0])]
        match = MatchFound(subset, phrase.start,
                           phrase.start + court_types[0][2])
        if len(subset) != 1:
            match.court_name = court_types[0][0]
            match.court_type = court_types[0][0]
        return [match]

    def _resolve_tag(self, match: MatchFound, column: str, explicit):
        """Pick a tag value: exact matches come from the dataframe row,
        otherwise prefer the explicitly resolved value, then the (non-empty)
        subset's first row, then ''."""
        if match.is_exact:
            return match.subset[column].values[0]
        if explicit is not None:
            return explicit
        return match.subset[column].values[0] if len(match.subset) > 0 else ''

    def add_annotation(self, match: MatchFound):
        """Append a court annotation dict built from *match* to self.annotations."""
        ant = dict(attrs={
            'start': match.entry_start,
            'end': match.entry_end
        },
                   tags={
                       'Extracted Entity Type': 'court',
                       'Extracted Entity Court Name':
                           self._resolve_tag(match, self.court_name_column,
                                             match.court_name),
                       'Extracted Entity Court Type':
                           self._resolve_tag(match, self.court_type_column,
                                             match.court_type),
                       'Extracted Entity Court Jurisdiction':
                           self._resolve_tag(match, self.jurisdiction_column,
                                             match.jurisdiction)
                   })
        self.annotations.append(ant)

    @staticmethod
    def get_unique_col_values(col_values):
        # unique, order-preserving, empty values dropped
        return [c for c in col_values.unique() if c]
def __init__(self, ptrs: ParserInitParams):
    """Build a locale (or region) specific courts parser.

    :param ptrs.court_pattern_checker: a regex or None; the parser skips a
        phrase (or the whole text) when the pattern doesn't match it, e.g.
        re.compile('court', re.IGNORECASE) for the En locale
    :param ptrs.column_names['type']: "Court Type", e.g. 'Federal District Court'
    :param ptrs.column_names['name']: "Court Name", e.g. 'Southern Georgia District Court'
    :param ptrs.column_names['jurisdiction']: "Jurisdiction", e.g. 'Federal'
    :param ptrs.column_names['alias']: "Alias", e.g. 'C.D. Cal'
    :param ptrs.dataframe_paths: like ['data/us_courts.csv', ...] - CSV files
        whose 'Court Name' column uniquely identifies each court, while
        'Court Type' + 'Jurisdiction' may or may not
    :param ptrs.split_ptrs: phrase splitting params, see LineProcessor; phrase
        bounds usually include punctuation (.,;!?) and conjunctions
    :param ptrs.key_word_preproc_func: a function used to pre-process column
        values before text search, e.g. lambda text: re.sub('e$', '[e]?', text)
    """
    self.phrase_match_pattern = None if ptrs.court_pattern_checker is None \
        else ptrs.court_pattern_checker
    columns = ptrs.column_names
    self.court_type_column = columns['type']
    self.court_name_column = columns['name']
    self.court_alias_column = columns['alias']
    self.jurisdiction_column = columns['jurisdiction']
    self.proc = LineProcessor()
    self.phrase_split_ptrs = ptrs.split_ptrs
    self.annotations = []
    self.courts = None
    self.load_courts(ptrs.dataframe_paths)
    preproc = ptrs.key_word_preproc_func
    unique = UniversalCourtsParser.get_unique_col_values
    # unique columns
    self.finder_court_name = PhraseFinder(
        unique(self.courts[self.court_name_column]), preproc)
    self.finder_court_alias = None if len(self.court_alias_column) == 0 else \
        PhraseFinder(unique(self.courts[self.court_alias_column]), preproc)
    # non-unique columns
    self.finder_court_type = PhraseFinder(
        unique(self.courts[self.court_type_column]), preproc)
    self.finder_jur = PhraseFinder(
        unique(self.courts[self.jurisdiction_column]), preproc)
class TextPatternCollector:
    """
    EsDefinitionsParser searches for definitions in text according
    to the rules of Spanish. See the "parse" method
    """
    basic_line_processor = LineProcessor()

    def __init__(self,
                 parsing_functions: List[Callable[[str], List[PatternFound]]],
                 split_params: LineSplitParams):
        """
        :param parsing_functions: a functions' collection from SpanishParsingMethods
        :param split_params: text-to-sentences splitting params
        """
        self.parsing_functions = parsing_functions
        self.annotations = []  # type: List[TextAnnotation]
        self.split_params = split_params
        self.proc = LineProcessor()
        self.prohibited_words = {}  # words that are Not definitions per se

    def parse(self, text: str, locale: str = None) -> List[TextAnnotation]:
        """Collect pattern annotations from *text*, phrase by phrase.

        :param locale: 'En', 'De', 'Es', ...
        :param text: En este acuerdo, el término "Software" se refiere a:
                     (i) el programa informático
        :return: a list of TextAnnotation objects with text-relative coords
        """
        self.annotations = []  # type: List[TextAnnotation]
        for phrase in self.proc.split_text_on_line_with_endings(
                text, self.split_params):
            # collect raw matches from all parsing functions
            matches = []
            for f in self.parsing_functions:
                matches += f(phrase.text)
            # drop stop-words, then keep only the best non-overlapping matches
            matches = self.remove_prohibited_words(matches)
            if len(matches) > 1:
                matches = self.choose_best_matches(matches)
                matches = self.choose_more_precise_matches(matches)
            for match in matches:
                ant = self.make_annotation_from_pattrn(locale, match, phrase)
                # phrase-relative coords -> text-relative coords
                ant.coords = (ant.coords[0] + phrase.start,
                              ant.coords[1] + phrase.start)
                self.annotations.append(ant)
        return self.annotations

    # pylint: disable=unused-argument
    def make_annotation_from_pattrn(self, locale: str, ptrn: PatternFound,
                                    phrase: LineOrPhrase) -> TextAnnotation:
        # should be overriden in derived class
        return None
    # pylint: enable=unused-argument

    def remove_prohibited_words(
            self, matches: List[PatternFound]) -> List[PatternFound]:
        # like 'und' or 'and' or 'the' - the word like this is not a definition itself
        return [m for m in matches if m.name not in self.prohibited_words]

    def choose_best_matches(self,
                            matches: List[PatternFound]) -> List[PatternFound]:
        """Keep a single, highest-quality match per (stripped) name.

        Groups explicitly via a dict instead of itertools.groupby: groupby
        merges only *adjacent* items, so duplicates that were not neighbours
        in the unsorted list were never deduplicated. First-seen group order
        is preserved (dicts keep insertion order).
        """
        grouped = {}
        for m in matches:
            grouped.setdefault(m.name.strip(" \t'\""), []).append(m)
        resulted = []
        for same_matches in grouped.values():
            if len(same_matches) > 1:
                # max() returns the first of equally-good matches, matching the
                # stable sorted(..., reverse=True)[0] behavior
                same_matches = [max(same_matches,
                                    key=TextPatternCollector.estimate_match_quality)]
            resulted += same_matches
        return resulted

    def choose_more_precise_matches(
            self, matches: List[PatternFound]) -> List[PatternFound]:
        """
        look for a match "consumed" by other matches
        and spare the consuming! matches
        """
        resulted = []
        if len(matches) < 2:
            return matches
        for i in range(0, len(matches)):
            a = matches[i]
            a_worse_b = False
            for j in range(0, len(matches)):
                if i == j:
                    continue
                b = matches[j]
                if a.pattern_worse_than_target(b):
                    a_worse_b = True
                    break
            if not a_worse_b:
                resulted.append(a)
        return resulted

    @staticmethod
    def estimate_match_quality(match: PatternFound) -> int:
        # higher probability wins; shorter spans win among equal probabilities
        return 1000 * match.probability - (match.end - match.start)
"may be used", "is hereby changed to", "is defined", "shall be interpreted"] WEAK_TRIGGER_LIST = [r"[\(\)]", "in "] ALL_TRIGGER_LIST = STRONG_TRIGGER_LIST + WEAK_TRIGGER_LIST STRONG_TRIGGER_LIST.sort(key=len, reverse=True) WEAK_TRIGGER_LIST.sort(key=len, reverse=True) ALL_TRIGGER_LIST.sort(key=len, reverse=True) def join_collection(collection): return "|".join([w.replace(" ", r"\s+") for w in collection]) word_processor = LineProcessor() # Case 1: Term in quotes, is preceded by word|term|phrase or :,.^ # and has item from TRIGGER_LIST after itself. # Fetch term along with quotes to be able to extract multiple terms, # e.g.: the words "person" and "whoever" include TRIGGER_WORDS_PTN = r""" (?:(?:word|term|phrase)s?\s+|[:,\.]\s*|^) ['"“].{{1,{max_term_chars}}}['"”]\s* (?:{trigger_list})[\s,]""".format( max_term_chars=MAX_TERM_CHARS, trigger_list=join_collection(ALL_TRIGGER_LIST)) TRIGGER_WORDS_PTN_RE = re.compile(TRIGGER_WORDS_PTN, re.IGNORECASE | re.UNICODE | re.DOTALL | re.MULTILINE | re.VERBOSE) EXTRACT_PTN = r"""['"“](.+?)['"”\.]""" EXTRACT_PTN_RE = re.compile(EXTRACT_PTN, re.UNICODE | re.DOTALL | re.MULTILINE)