def try_merge_modifier_token(self, extract_result: ExtractResult, pattern: Pattern, source: str,
                             potential_ambiguity: bool = False) -> bool:
    """Try to widen ``extract_result`` so that it also covers an adjacent modifier token.

    The text in front of the result is checked first. When nothing matches there and
    ``self.config.check_both_before_after`` is truthy, the text following the result is
    checked as well. On a successful merge the result is mutated in place.

    Args:
        extract_result: Result to extend; ``start``, ``length``, ``text`` and ``meta_data``
            are rewritten on success (``data`` too for a trailing modifier).
        pattern: Modifier pattern handed to ``self.has_token_index``.
        source: Full text that ``extract_result`` was extracted from.
        potential_ambiguity: When true, run the ambiguous-range check before merging.

    Returns:
        ``True`` when a modifier was merged into the result. In the ambiguity branch
        (ambiguous-range matches were found) the return value is instead whether any of
        those matches overlaps the result; the result is left untouched in that case.
        ``False`` otherwise.
    """
    before_str = source[0:extract_result.start]

    # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
    if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
            regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
        matches = list(regex.finditer(self.config.potential_ambiguous_range_regex, source))
        if matches:
            result_end = extract_result.start + extract_result.length
            # NOTE(review): this reports True on overlap although nothing is merged, and
            # returns False without attempting the token merge when matches exist but none
            # overlap. Behaviour is kept as-is; confirm against the callers' expectations.
            return any(match.start() < result_end and match.end() > extract_result.start
                       for match in matches)

    token = self.has_token_index(before_str.strip(), pattern)
    if token.matched:
        # token.index is relative to the stripped prefix; add back the leading whitespace
        # that strip() removed so the offset lines up with ``source``.
        leading_ws = len(before_str) - len(before_str.lstrip())
        mod_len = len(before_str) - (token.index + leading_ws)

        extract_result.length += mod_len
        extract_result.start -= mod_len
        extract_result.text = source[extract_result.start:extract_result.start + extract_result.length]
        extract_result.meta_data = self.assign_mod_metadata(extract_result.meta_data)
        return True

    if self.config.check_both_before_after:
        # check also after_str: everything that follows the extracted span
        after_str = source[extract_result.start + extract_result.length:]
        token = self.has_token_index(after_str.strip(), pattern)
        if token.matched:
            # Only the whitespace between the result and the modifier belongs to the merged
            # span; trailing whitespace removed by strip() must not inflate the length.
            # NOTE(review): assumes token.index is the offset inside the stripped suffix up
            # to which the span should extend - verify against has_token_index.
            leading_ws = len(after_str) - len(after_str.lstrip())
            mod_len = token.index + leading_ws

            extract_result.length += mod_len
            extract_result.text = source[extract_result.start:extract_result.start + extract_result.length]
            extract_result.data = Constants.HAS_MOD
            extract_result.meta_data = self.assign_mod_metadata(extract_result.meta_data)
            return True

    return False
def try_merge_modifier_token(self, er: ExtractResult, pattern: Pattern, source: str,
                             potentialAmbiguity: bool = False) -> bool:
    """Try to extend ``er`` leftwards so that it also covers a preceding modifier token.

    Args:
        er: Result to extend; ``start``, ``length``, ``text`` and ``meta_data`` are
            rewritten in place when a modifier is merged.
        pattern: Modifier pattern handed to ``self.has_token_index``.
        source: Full text that ``er`` was extracted from.
        potentialAmbiguity: When true, run the ambiguous-range check before merging.

    Returns:
        ``True`` when a modifier was merged into ``er``. When the ambiguity check finds
        ambiguous-range matches, the value of ``self._filter_item(er, matches)`` is
        returned instead and ``er`` is not modified here. ``False`` otherwise.
    """
    before_str = source[0:er.start]

    # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
    if potentialAmbiguity and self.config.ambiguous_range_modifier_prefix and regex.search(
            self.config.ambiguous_range_modifier_prefix, before_str):
        matches = list(regex.finditer(self.config.potential_ambiguous_range_regex, source))
        if matches:
            return self._filter_item(er, matches)

    token = self.has_token_index(before_str.strip(), pattern)
    if not token.matched:
        return False

    # token.index is relative to the stripped prefix; add back the leading whitespace that
    # strip() removed so the offset lines up with ``source``.
    leading_ws = len(before_str) - len(before_str.lstrip())
    mod_len = len(before_str) - (token.index + leading_ws)

    er.length += mod_len
    er.start -= mod_len
    er.text = source[er.start:er.start + er.length]
    er.meta_data = self.assign_mod_metadata(er.meta_data)
    return True
def merge_all_tokens(tokens: List[Token], source: str, extractor_name: str) -> List[ExtractResult]:
    """Resolve overlaps between tokens and turn the surviving ones into extract results.

    Falsy entries in ``tokens`` are ignored and the remainder is processed in ascending
    ``start`` order. Each candidate is compared against the already kept tokens, in order,
    until the first one it conflicts with:

    * lying inside a kept token, or starting strictly inside one, discards the candidate;
    * covering a kept token replaces that kept token with the candidate.

    A candidate that conflicts with nothing is appended to the kept list.

    Args:
        tokens: Candidate tokens exposing ``start``, ``end``, ``length`` and ``metadata``.
        source: Text the token offsets refer to.
        extractor_name: Value stored as ``type`` on every produced result.

    Returns:
        One ``ExtractResult`` per kept token, in kept order.
    """
    ordered = [candidate for candidate in tokens if candidate]
    ordered.sort(key=lambda candidate: candidate.start)

    kept: List[Token] = []
    for candidate in ordered:
        conflicted = False
        for slot, existing in enumerate(kept):
            inside_existing = candidate.start >= existing.start and candidate.end <= existing.end
            starts_within = existing.start < candidate.start < existing.end
            covers_existing = candidate.start <= existing.start and candidate.end >= existing.end

            if covers_existing:
                # The wider (or identical) candidate takes over the kept slot.
                kept[slot] = candidate
            if inside_existing or starts_within or covers_existing:
                # Only the first conflicting kept token is ever considered.
                conflicted = True
                break
        if not conflicted:
            kept.append(candidate)

    results = []
    for survivor in kept:
        begin, size = survivor.start, survivor.length
        item = ExtractResult()
        item.start = begin
        item.length = size
        item.text = source[begin:begin + size]
        item.type = extractor_name
        item.data = None
        item.meta_data = survivor.metadata
        results.append(item)
    return results