def extract(self, source: str) -> List[ExtractResult]:
    """Extract spans of ``source`` matched by the patterns in ``self.regexes``.

    Every pattern is run over ``source`` and the characters covered by any
    match are flagged. Each maximal run of flagged characters becomes a
    candidate; it is emitted only when a single regex match covers exactly
    that run, and the value registered with that pattern is stored in
    ``data``.

    A candidate that starts (or ends) with ``Constants.IPV6_ELLIPSIS`` is
    dropped when the character immediately before (or after) it is a digit
    or a non-CJK letter.

    :param source: text to scan.
    :return: extraction results in order of appearance; an empty list when
        ``self._pre_check_str(source)`` is falsy.
    """
    result: List[ExtractResult] = list()
    if not self._pre_check_str(source):
        return result

    source_len = len(source)
    matched: List[bool] = [False] * source_len
    # (start, length) -> value of the first pattern that produced exactly
    # that span. Keeps source data for extra information.
    span_values = dict()

    for entry in self.regexes:
        for m in re.finditer(entry.re, source):
            for pos in range(m.start(), m.end()):
                matched[pos] = True
            span_values.setdefault((m.start(), m.end() - m.start()), entry.val)

    tokenizer = SimpleTokenizer()

    def is_word_char(index: int) -> bool:
        # True for a digit or a letter that is not CJK.
        ch = source[index]
        return ch.isdigit() or (ch.isalpha() and not tokenizer.is_cjk(c=ch))

    last = -1
    for i in range(source_len):
        if not matched[i]:
            last = i
            continue
        if i + 1 < source_len and matched[i + 1]:
            # Still inside the current run of matched characters.
            continue

        start = last + 1
        length = i - last
        substring = source[start:start + length].strip()

        if (substring.startswith(Constants.IPV6_ELLIPSIS)
                and start > 0 and is_word_char(start - 1)):
            continue
        # The character *after* the run is the one that matters here; the
        # previous code inspected source[start - 1] for the CJK test.
        if (substring.endswith(Constants.IPV6_ELLIPSIS)
                and i + 1 < source_len and is_word_char(i + 1)):
            continue

        key = (start, length)
        if key in span_values:
            value = ExtractResult()
            value.start = start
            value.length = length
            value.text = substring
            value.type = self._extract_type
            value.data = span_values[key]
            result.append(value)

    return result
def strip_inequality(extract_result: ExtractResult, regexp: Pattern, in_prefix: bool):
    """Remove the text matched by ``regexp`` from ``extract_result`` in place.

    When ``regexp`` matches ``extract_result.text``, every match is removed,
    the remainder is stripped of surrounding whitespace, ``length`` is
    recomputed and ``data`` is reset to ``''``. Nothing changes when the
    pattern does not match.

    :param extract_result: result to mutate.
    :param regexp: pattern describing the text to strip.
    :param in_prefix: when true, ``start`` is advanced by the number of
        characters removed.
    """
    if regex.search(regexp, extract_result.text):
        original_length = len(extract_result.text)
        # Substitute on the text itself. The previous code called
        # str(regexp).replace(text, ''), which edits the pattern's string
        # form and never strips anything from the extracted text.
        extract_result.text = regex.sub(regexp, '', extract_result.text).strip()
        if in_prefix:
            extract_result.start += original_length - len(extract_result.text)
        extract_result.length = len(extract_result.text)
        extract_result.data = ''
def extract(self, source: str):
    """Extract choice matches from ``source``.

    The lower-cased text is tokenized once. For every ``(regexp, type)``
    pair in ``self.config.regexes_map`` each matched string is scored with
    ``self.match_value`` at every token position, and the best score is
    kept. Matches with a positive score become results whose ``data`` is a
    ``ChoiceExtractDataResult(source, score)``.

    :param source: text to scan; ``None`` or blank text yields ``[]``.
    :return: results sorted by start offset, or only the single
        best-scoring result when ``self.config.only_top_match`` is set.
    """
    results: List[ExtractResult] = list()
    # Guard before touching the string: the previous code called
    # source.lower() first and raised AttributeError for None.
    if source is None or source.strip() == '':
        return results

    trimmed_source = source.lower()
    source_tokens = self.__tokenize(trimmed_source)
    partial_results: List[ExtractResult] = list()

    for regexp, type_extracted in self.config.regexes_map.items():
        for match in RegExpUtility.get_matches(regexp, trimmed_source):
            match_tokens = self.__tokenize(match)
            top_score = 0.0
            for i in range(len(source_tokens)):
                score = self.match_value(source_tokens, match_tokens, i)
                top_score = max(top_score, score)

            if top_score > 0.0:
                # NOTE(review): index() finds the first occurrence only, so
                # repeated matches of the same text share one start offset.
                start = trimmed_source.index(match)
                length = len(match)

                value = ExtractResult()
                value.start = start
                value.length = length
                value.text = source[start:start + length].strip()
                value.type = type_extracted
                value.data = ChoiceExtractDataResult(source, top_score)
                partial_results.append(value)

    if len(partial_results) == 0:
        return results

    partial_results = sorted(partial_results, key=lambda res: res.start)

    if self.config.only_top_match:
        top_score = 0.0
        top_result_index = 0
        for i, candidate in enumerate(partial_results):
            if candidate.data.score > top_score:
                top_score = candidate.data.score
                top_result_index = i
        # NOTE(review): the previous code built a separate
        # ChoiceExtractDataResult, set other_matches on it and then
        # discarded it, so the returned result never carried the other
        # matches. That outcome is unchanged here; confirm whether
        # other_matches should be attached to the returned result's data.
        results.append(partial_results[top_result_index])
    else:
        results = partial_results

    return results
def extract(self, source: str) -> List[ExtractResult]:
    """Extract spans of ``source`` matched by the patterns in ``self.regexes``.

    Characters covered by any match are flagged, and each maximal run of
    flagged characters becomes a candidate. A candidate is emitted only
    when a single regex match covers exactly that run. When
    ``self._negative_number_terms`` is set and matches the text before the
    run, the result is extended to start at that match. The collected
    results are finally passed through ``self._filter_ambiguity``.

    :param source: text to scan; ``None`` or blank text yields ``[]``.
    :return: the filtered extraction results.
    """
    # Truthiness test instead of `len(...) is 0`: identity comparison with
    # an int literal is implementation-dependent and emits a SyntaxWarning.
    if source is None or not source.strip():
        return list()

    result: List[ExtractResult] = list()
    source_len = len(source)
    matched: List[bool] = [False] * source_len
    # (start, length) -> value of the first pattern that produced exactly
    # that span. Keeps source data for extra information.
    span_values = dict()

    for entry in self.regexes:
        for m in regex.finditer(entry.re, source):
            for pos in range(m.start(), m.end()):
                matched[pos] = True
            span_values.setdefault((m.start(), m.end() - m.start()), entry.val)

    last = -1
    for i in range(source_len):
        if not matched[i]:
            last = i
            continue
        if i + 1 < source_len and matched[i + 1]:
            # Still inside the current run of matched characters.
            continue

        start = last + 1
        length = i - last
        key = (start, length)
        if key not in span_values:
            continue
        substr = source[start:start + length].strip()

        # Extract negative numbers: widen the span to include a negative
        # term found in the text preceding the number.
        if self._negative_number_terms is not None:
            match = regex.search(self._negative_number_terms, source[0:start])
            if match is not None:
                start = match.start()
                length = length + match.end() - match.start()
                substr = source[start:start + length].strip()

        value = ExtractResult()
        value.start = start
        value.length = length
        value.text = substr
        value.type = self._extract_type
        value.data = span_values[key]
        result.append(value)

    result = self._filter_ambiguity(result, source)
    return result
def try_merge_modifier_token(self, extract_result: ExtractResult, pattern: Pattern, source: str,
                             potential_ambiguity: bool = False) -> bool:
    """Try to grow ``extract_result`` so that it includes a modifier token.

    The text before the extraction is checked first. When
    ``self.config.check_both_before_after`` is set and nothing was found
    before, the text after the extraction is checked as well. On success
    ``start``/``length``/``text`` are updated in place and ``meta_data`` is
    replaced by ``self.assign_mod_metadata(...)``.

    :param extract_result: extraction to extend (mutated on success).
    :param pattern: pattern describing the modifier token.
    :param source: full text ``extract_result`` was taken from.
    :param potential_ambiguity: enables the ambiguous-range check below.
    :return: ``True`` when a modifier was merged, ``False`` otherwise. When
        the ambiguity check applies, the result is instead whether any
        ambiguous range overlaps the extraction, and nothing is merged.
    """
    before_str = source[0:extract_result.start]

    # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
    if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
            regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
        matches = list(
            regex.finditer(self.config.potential_ambiguous_range_regex, source))
        if matches:
            result_end = extract_result.start + extract_result.length
            # NOTE(review): this returns True on overlap without merging
            # anything; kept as-is, confirm against the callers.
            return any(match.start() < result_end and match.end() > extract_result.start
                       for match in matches)

    token = self.has_token_index(before_str.strip(), pattern)
    if token.matched:
        mod_len = len(before_str) - token.index
        extract_result.length += mod_len
        extract_result.start -= mod_len
        extract_result.text = source[extract_result.start:extract_result.start + extract_result.length]
        extract_result.meta_data = self.assign_mod_metadata(extract_result.meta_data)
        return True
    elif self.config.check_both_before_after:
        # Check the text that FOLLOWS the extraction. The previous slice,
        # source[start:length], used the length as an end index and did not
        # select the trailing text.
        after_str = source[extract_result.start + extract_result.length:]
        token = self.has_token_index(after_str.strip(), pattern)
        if token.matched:
            # token.index is relative to the stripped string, so add back
            # only the whitespace removed from the front.
            leading_ws = len(after_str) - len(after_str.lstrip())
            mod_len = token.index + leading_ws
            extract_result.length += mod_len
            extract_result.text = source[extract_result.start:extract_result.start + extract_result.length]
            extract_result.data = Constants.HAS_MOD
            extract_result.meta_data = self.assign_mod_metadata(extract_result.meta_data)
            return True

    return False
def extract(self, source: str) -> List[ExtractResult]:
    """Extract spans of ``source`` matched by the patterns in ``self.regexes``.

    Only matches accepted by ``self._is_valid_match`` are considered. The
    characters they cover are flagged, and every maximal run of flagged
    characters that coincides exactly with one accepted match is returned,
    carrying the value registered with that match's pattern in ``data``.

    :param source: text to scan.
    :return: extraction results in order of appearance; an empty list when
        ``self._pre_check_str(source)`` is falsy.
    """
    extracted: List[ExtractResult] = []
    if not self._pre_check_str(source):
        return extracted

    total = len(source)
    covered: List[bool] = [False] * total

    # Run every pattern first, then mark the accepted matches.
    found = [(list(re.finditer(entry.re, source)), entry.val) for entry in self.regexes]

    # (start, length) -> value of the first accepted match with that span.
    span_values = {}
    for hits, val in found:
        for hit in hits:
            if not self._is_valid_match(hit):
                continue
            for pos in range(hit.start(), hit.end()):
                covered[pos] = True
            # Keep source data for extra information.
            span_values.setdefault((hit.start(), hit.end() - hit.start()), val)

    previous_gap = -1
    for pos, is_covered in enumerate(covered):
        if not is_covered:
            previous_gap = pos
            continue
        run_continues = pos + 1 != total and covered[pos + 1]
        if run_continues:
            continue

        begin = previous_gap + 1
        size = pos - previous_gap
        if (begin, size) not in span_values:
            continue

        item = ExtractResult()
        item.start = begin
        item.length = size
        item.text = source[begin:begin + size].strip()
        item.type = self._extract_type
        item.data = span_values[(begin, size)]
        extracted.append(item)

    return extracted
def parse(self, source: ExtractResult) -> Optional[ParseResult]:
    """Parse ``source`` with the parent parser and format it as a percentage.

    When ``source.data`` is a list, its first element replaces
    ``source.text`` and the ``data`` attribute of its second element
    replaces ``source.data`` before delegating to ``super().parse``. A
    non-empty resolution string that does not already end with ``'%'`` is
    stripped and given a ``'%'`` suffix. The returned result carries the
    text that was parsed in ``data`` and the original text in ``text``.

    :param source: extraction to parse (its ``text``/``data`` may be
        rewritten as described above).
    :return: the parse result, or ``None`` when the parent parser returns
        ``None``.
    """
    original = source.text

    # do replace text & data from extended info
    if isinstance(source.data, list):
        source.text = source.data[0]
        source.data = source.data[1].data

    result: ParseResult = super().parse(source)
    # The signature allows None; pass it through instead of dereferencing it.
    if result is None:
        return None

    if result.resolution_str:
        resolution = result.resolution_str.strip()
        if not resolution.endswith('%'):
            result.resolution_str = resolution + '%'

    result.data = source.text
    result.text = original
    return result
def merge_all_tokens(tokens: List[Token], source: str, extractor_name: str) -> List[ExtractResult]:
    """Collapse overlapping tokens and turn the survivors into results.

    Falsy entries are dropped and the tokens are processed in order of
    ``start``. A token is discarded when it lies inside, or starts strictly
    inside, an already kept token; when it fully covers a kept token it
    takes that token's place. Only the first kept token that relates to the
    candidate in one of these ways is considered.

    :param tokens: candidate tokens (may contain falsy entries).
    :param source: text the token offsets refer to.
    :param extractor_name: value assigned to each result's ``type``.
    :return: one ``ExtractResult`` per kept token, with ``data`` set to
        ``None`` and ``meta_data`` taken from the token.
    """
    kept: List[Token] = []

    for candidate in sorted((tok for tok in tokens if tok), key=lambda tok: tok.start):
        absorbed = False
        for slot, existing in enumerate(kept):
            inside = candidate.start >= existing.start and candidate.end <= existing.end
            starts_within = existing.start < candidate.start < existing.end
            covers = candidate.start <= existing.start and candidate.end >= existing.end

            if covers:
                # The wider (or equal) candidate replaces the kept token.
                kept[slot] = candidate
            if inside or starts_within or covers:
                absorbed = True
                break

        if not absorbed:
            kept.append(candidate)

    extracted = []
    for tok in kept:
        begin = tok.start
        size = tok.length

        item = ExtractResult()
        item.start = begin
        item.length = size
        item.text = source[begin:begin + size]
        item.type = extractor_name
        item.data = None
        item.meta_data = tok.metadata
        extracted.append(item)

    return extracted
def __merged_compound_units(self, source: str):
    """Extract units from ``source`` and merge adjacent ones into compounds.

    Results of ``NumberWithUnitExtractor`` (after ``__merge_pure_number``)
    are assigned group numbers: neighbours separated only by whitespace, or
    only by a connector matched by
    ``self.config.compound_unit_connector_regex``, share a group. Each group
    is then reduced to one result spanning all its members, typed
    ``Constants.SYS_UNIT_CURRENCY``, with the members listed in ``data``.
    Groups of a single member are returned as a copy of that member, and
    results typed ``Constants.SYS_NUM`` are dropped.

    :param source: text to scan.
    :return: list of merged extraction results.
    """
    ers = NumberWithUnitExtractor(self.config).extract(source)
    ers = self.__merge_pure_number(source, ers)

    groups = [0] * len(ers)
    for idx in range(len(ers) - 1):
        current, following = ers[idx], ers[idx + 1]

        if (current.type != following.type
                and current.type != Constants.SYS_NUM
                and following.type != Constants.SYS_NUM):
            continue

        if isinstance(current.data, ExtractResult) and not str(current.data.data).startswith("Integer"):
            groups[idx + 1] = groups[idx] + 1
            continue

        middle_begin = current.start + current.length
        middle_end = following.start
        middle_str = source[middle_begin:middle_end].strip().lower()

        # Separated by whitespace
        if not middle_str:
            groups[idx + 1] = groups[idx]
            continue

        # Separated by connector: the connector must span the whole gap.
        # Pattern.match anchors at position 0, so comparing the match end
        # with the gap length is sufficient. The previous check compared
        # match.pos (always 0) and the first space-separated word of
        # match.string (the whole gap), which never verified the match
        # extent.
        match = self.config.compound_unit_connector_regex.match(middle_str)
        if match is not None and match.end() == len(middle_str):
            groups[idx + 1] = groups[idx]
        else:
            groups[idx + 1] = groups[idx] + 1

    result = []
    for idx in range(len(ers)):
        if idx == 0 or groups[idx] != groups[idx - 1]:
            # Start a new group: keep the original object as the group head
            # and store a copy of it as the first member.
            head = ers[idx]
            member = ExtractResult()
            member.data = head.data
            member.length = head.length
            member.start = head.start
            member.text = head.text
            member.type = head.type
            head.data = [member]
            result.append(head)

        # reduce extract results in same group
        if idx + 1 < len(ers) and groups[idx + 1] == groups[idx]:
            # NOTE(review): result is indexed by group number, which assumes
            # group ids coincide with positions in result.
            group = groups[idx]
            period_begin = result[group].start
            period_end = ers[idx + 1].start + ers[idx + 1].length
            result[group].length = period_end - period_begin
            result[group].text = source[period_begin:period_end]
            result[group].type = Constants.SYS_UNIT_CURRENCY
            if isinstance(result[group].data, list):
                result[group].data.append(ers[idx + 1])

    # A group with a single member is replaced by that member.
    result = [item.data[0] if len(item.data) == 1 else item for item in result]
    return [item for item in result if item.type != Constants.SYS_NUM]
def extract(self, source: str) -> List[ExtractResult]:
    """Extract numbers together with their prefix and/or suffix units.

    Unit candidates come from ``self.prefix_matcher`` and
    ``self.suffix_matcher``; numbers come from
    ``self.config.unit_num_extractor``. For each number the closest prefix
    unit ending right before it and the longest suffix unit following it
    (directly, after whitespace, or after
    ``self.config.connector_token``) are attached. Stand-alone units are
    added afterwards when ``self.separate_regex`` is set, and
    ``self.config.expand_half_suffix`` is called last.

    :param source: text to scan.
    :return: list of extraction results; ``[]`` when
        ``self._pre_check_str(source)`` is falsy.
    """
    if not self._pre_check_str(source):
        return []

    non_unit_match = None
    numbers = None

    mapping_prefix: Dict[float, PrefixUnitResult] = dict()
    result = []
    prefix_matched = False
    prefix_match: List[MatchResult] = sorted(self.prefix_matcher.find(source), key=lambda o: o.start)
    suffix_match: List[MatchResult] = sorted(self.suffix_matcher.find(source), key=lambda o: o.start)

    if len(prefix_match) > 0 or len(suffix_match) > 0:
        numbers: List[ExtractResult] = sorted(self.config.unit_num_extractor.extract(source),
                                              key=lambda o: o.start)

        # Compare by value: `is` on strings only works when both sides
        # happen to be the same object.
        if len(numbers) > 0 and self.config.extract_type == Constants.SYS_UNIT_CURRENCY \
                and len(prefix_match) > 0 and len(suffix_match) > 0:
            for number in numbers:
                start = number.start
                length = number.length
                has_prefix = any((mr.start + mr.length) == start for mr in prefix_match)
                has_suffix = any(mr.start == (start + length) for mr in suffix_match)
                if has_prefix and has_suffix and "," in number.text:
                    # Blank out the comma so the number is re-extracted as
                    # two separate numbers.
                    comma_index = number.start + number.text.index(",")
                    source = source[:comma_index] + " " + source[comma_index + 1:]

            numbers: List[ExtractResult] = sorted(self.config.unit_num_extractor.extract(source),
                                                  key=lambda o: o.start)

        # Special case for cases where number multipliers clash with unit
        ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex
        if ambiguous_multiplier_regex is not None:
            for num in numbers:
                hits = [m for m in regex.finditer(ambiguous_multiplier_regex, num.text) if m.group()]
                if len(hits) == 1:
                    new_length = num.length - (hits[0].end() - hits[0].start())
                    num.text = num.text[0:new_length]
                    num.length = new_length

        for number in numbers:
            if number.start is None or number.length is None:
                continue

            start = int(number.start)
            length = int(number.length)
            max_find_pref = min(self.max_prefix_match_len, number.start)
            max_find_suff = len(source) - start - length

            if max_find_pref != 0:
                last_index = start
                best_match = None
                for m in prefix_match:
                    if m.length > 0 and m.end > start:
                        break
                    if m.length > 0 and source[m.start:last_index].strip() == m.text:
                        best_match = m
                        break

                if best_match is not None:
                    off_set = last_index - best_match.start
                    unit_str = source[best_match.start:best_match.start + off_set]
                    self.add_element(mapping_prefix, number.start, (PrefixUnitResult(off_set, unit_str)))

            prefix_unit = mapping_prefix.get(start, None)

            if max_find_suff > 0:
                max_len = 0
                first_index = start + length
                for m in suffix_match:
                    if m.length > 0 and m.start >= first_index:
                        end_pos = m.start + m.length - first_index
                        if max_len < end_pos:
                            gap = source[first_index:m.start].strip()
                            if not gap or gap == self.config.connector_token:
                                max_len = end_pos

                if max_len != 0:
                    er = ExtractResult()
                    er.start = start
                    er.length = length + max_len
                    er.text = source[start:start + length + max_len]
                    er.type = self.config.extract_type

                    if prefix_unit is not None:
                        prefix_matched = True
                        er.start -= prefix_unit[0].offset
                        er.length += prefix_unit[0].offset
                        er.text = prefix_unit[0].unit + er.text

                    # Relative position will be used in Parser
                    number.start = start - er.start
                    er.data = number

                    # Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                    is_not_unit = False
                    if er.type == Constants.SYS_UNIT_DIMENSION:
                        if non_unit_match is None:
                            non_unit_match = list(self.config.non_unit_regex.finditer(source))
                        # The previous loop also evaluated
                        # source.lower().index(time.group()) and ignored the
                        # result; that call raised ValueError whenever the
                        # matched text contained upper-case characters.
                        is_not_unit = any(
                            er.start >= time.start() and er.start + er.length <= time.end()
                            for time in non_unit_match)

                    if is_not_unit:
                        continue

                    result.append(er)

            if prefix_unit and not prefix_matched:
                er = ExtractResult()
                er.start = number.start - prefix_unit[0].offset
                er.length = number.length + prefix_unit[0].offset
                er.text = prefix_unit[0].unit + number.text
                er.type = self.config.extract_type

                # Relative position will be used in Parser
                number.start = start - er.start
                er.data = number
                result.append(er)

    # Extract Separate unit
    if self.separate_regex:
        if non_unit_match is None:
            # The previous code called list(pattern.match(source)) inside a
            # bare `except`; a Match object (or None) is not iterable, so
            # this always fell back to []. Collect the matches with finditer.
            non_unit_regex = getattr(self.config, 'non_unit_regex', None)
            non_unit_match = list(non_unit_regex.finditer(source)) if non_unit_regex is not None else []

        self._extract_separate_units(source, result, non_unit_match)

        # Remove common ambiguous cases
        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; confirm the ambiguity filter is meant to run only when
        # separate_regex is set.
        result = self._filter_ambiguity(result, source)

    # Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
    self.config.expand_half_suffix(source, result, numbers)

    return result
def extract(self, source: str) -> List[ExtractResult]:
    """Extract numbers together with their prefix and/or suffix units.

    Numbers are taken from ``self.config.unit_num_extractor``. A first pass
    looks, within ``self.max_prefix_match_len`` characters before each
    number, for a ``self.prefix_regex`` match that runs up to the number. A
    second pass looks after each number for the longest
    ``self.suffix_regex`` match that follows it directly, after whitespace,
    or after ``self.config.connector_token``. Numbers with a suffix unit
    (plus prefix, if any) or with only a prefix unit become results. When
    ``self.separate_regex`` is set, the results are finally handed to
    ``self._extract_separate_units``.

    :param source: text to scan.
    :return: list of extraction results; an empty list when
        ``self._pre_check_str(source)`` is falsy.
    """
    if not self._pre_check_str(source):
        return list()

    prefix_by_start: Dict[float, PrefixUnitResult] = dict()
    covered: List[bool] = [False] * len(source)
    numbers: List[ExtractResult] = self.config.unit_num_extractor.extract(source)
    extracted: List[ExtractResult] = list()
    total_len = len(source)

    # Pass 1: find the prefix unit (if any) attached to each number.
    if self.max_prefix_match_len != 0:
        for num in numbers:
            if num.start is None or num.length is None:
                continue
            window = min(self.max_prefix_match_len, num.start)
            if window == 0:
                continue

            left: str = source[num.start - window:num.start]
            left_len = len(left)
            best = None
            for pattern in self.prefix_regex:
                for cand in regex.finditer(pattern, left):
                    if not cand.group():
                        continue
                    # The candidate must reach the number, ignoring spaces.
                    if left[cand.start():left_len].strip() != cand.group():
                        continue
                    if best is None or best.start() >= cand.start():
                        best = cand

            if best:
                prefix_by_start[num.start] = PrefixUnitResult(
                    offset=left_len - best.start(),
                    unit=left[best.start():left_len])

    # Pass 2: find the suffix unit and build the results.
    for num in numbers:
        if num.start is None or num.length is None:
            continue

        start = num.start
        length = num.length
        remaining = total_len - start - length
        prefix_unit: PrefixUnitResult = prefix_by_start.get(start, None)

        if remaining > 0:
            tail_begin = start + length
            right = source[tail_begin:tail_begin + remaining]
            suffix_hits = [hit
                           for pattern in self.suffix_regex
                           for hit in regex.finditer(pattern, right)
                           if hit.group()]

            max_len = 0
            for hit in suffix_hits:
                end_pos = hit.start() + len(hit.group())
                gap = right[:min(hit.start(), len(right))].strip()
                if max_len < end_pos and (not gap or gap == self.config.connector_token):
                    max_len = end_pos

            if max_len != 0:
                span = length + max_len
                for pos in range(start, start + span):
                    covered[pos] = True

                found = ExtractResult()
                found.start = start
                found.length = span
                found.text = source[start:start + span]
                found.type = self.config.extract_type

                if prefix_unit:
                    found.start -= prefix_unit.offset
                    found.length += prefix_unit.offset
                    found.text = prefix_unit.unit + found.text

                # The number's start becomes relative to the result.
                num.start = start - found.start
                found.data = num

                # A dimension lying inside a pm_non_unit_regex match is not
                # a unit and is skipped.
                is_not_unit = False
                if found.type == Constants.SYS_UNIT_DIMENSION:
                    is_not_unit = any(
                        found.start >= non_unit.start() and found.end <= non_unit.end()
                        for non_unit in self.config.pm_non_unit_regex.finditer(source))

                if not is_not_unit:
                    extracted.append(found)
                continue

        if prefix_unit:
            found = ExtractResult()
            found.start = num.start - prefix_unit.offset
            found.length = num.length + prefix_unit.offset
            found.text = prefix_unit.unit + num.text
            found.type = self.config.extract_type
            num.start = start - found.start
            found.data = num
            extracted.append(found)

    if self.separate_regex:
        extracted = self._extract_separate_units(source, extracted)

    return extracted