def get_year_from_text(self, match: Match) -> int:
    """Resolve the year captured by ``match``.

    A non-blank ``year`` group is read as digits. Values in
    ``[Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM, 100)`` get 1900 added; other
    values in ``[0, Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM)`` get 2000 added.
    Otherwise the year is assembled from the
    ``Constants.FIRST_TWO_YEAR_NUM`` and ``Constants.LAST_TWO_YEAR_NUM``
    groups, each resolved through ``self.config.number_parser``.

    :param match: regex match exposing the groups named above.
    :return: the resolved year, or ``Constants.INVALID_YEAR`` when the match
        carries no usable year.
    """

    def parse_part(group_name: str, text: str) -> int:
        # Wrap one captured group for the number parser. The offset is taken
        # from the match itself so repeated words ("twenty twenty") keep
        # their own positions.
        er = ExtractResult()
        er.text = text
        er.start = match.start(group_name)
        er.length = len(text)
        value = self.config.number_parser.parse(er).value
        return value if value else 0

    year_str = RegExpUtility.get_group(match, 'year')
    if year_str and not year_str.isspace():
        year = int(year_str)
        if 100 > year >= Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM:
            year += 1900
        elif 0 <= year < Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM:
            year += 2000
        return year

    first_two_year_num_str = RegExpUtility.get_group(
        match, Constants.FIRST_TWO_YEAR_NUM)
    if not first_two_year_num_str or first_two_year_num_str.isspace():
        return Constants.INVALID_YEAR

    first_two_year_num = parse_part(
        Constants.FIRST_TWO_YEAR_NUM, first_two_year_num_str)

    last_two_year_num = 0
    last_two_year_num_str = RegExpUtility.get_group(
        match, Constants.LAST_TWO_YEAR_NUM)
    # Only parse the trailing part when something was actually captured.
    if last_two_year_num_str and not last_two_year_num_str.isspace():
        last_two_year_num = parse_part(
            Constants.LAST_TWO_YEAR_NUM, last_two_year_num_str)

    # Reject a lone small number, and a round tens value followed by a
    # single word.
    if first_two_year_num < 100 and (
            last_two_year_num == 0
            or (first_two_year_num % 10 == 0
                and len((last_two_year_num_str or '').strip().split(' ')) == 1)):
        return Constants.INVALID_YEAR

    if first_two_year_num >= 100:
        return first_two_year_num + last_two_year_num
    return (first_two_year_num * 100) + last_two_year_num
def __get_year_from_text(self, match) -> int:
    """Build a year from the ``firsttwoyearnum`` / ``lasttwoyearnum`` groups.

    Each captured group is wrapped in an ``ExtractResult`` and resolved by
    ``self.config.number_parser``.

    :param match: regex match exposing both named groups.
    :return: the combined year, or ``-1`` when the leading group is missing
        or the combination is rejected.
    """
    leading_text = match.group('firsttwoyearnum')
    if not leading_text:
        return -1

    # A single ExtractResult is filled in and reused for both parts.
    er = ExtractResult()
    er.text = leading_text
    er.start = match.start('firsttwoyearnum')
    er.length = match.end('firsttwoyearnum') - er.start
    leading_num = self.config.number_parser.parse(er).value

    trailing_num = 0
    trailing_text = match.group('lasttwoyearnum')
    if trailing_text:
        er.text = trailing_text
        er.start = match.start('lasttwoyearnum')
        er.length = match.end('lasttwoyearnum') - er.start
        trailing_num = self.config.number_parser.parse(er).value

    if leading_num < 100:
        # A small leading number needs a trailing part ...
        if trailing_num == 0:
            return -1
        # ... and a round tens value must not be followed by a single word.
        if leading_num % 10 == 0 and len(trailing_text.strip().split(' ')) == 1:
            return -1

    if leading_num >= 100:
        return leading_num + trailing_num
    return leading_num * 100 + trailing_num
def extract(self, source: str) -> List[ExtractResult]:
    """Extract every span of ``source`` matched by ``self.regexes``.

    Characters covered by any regex match are merged into contiguous spans.
    A span is reported only when one single match covers exactly that span;
    the ``val`` registered with that regex becomes the result's ``data``.
    Spans that start or end with ``Constants.IPV6_ELLIPSIS`` are dropped
    when the character just outside that edge is a digit or a non-CJK
    letter.

    :param source: text to scan.
    :return: extracted results in order of appearance; empty when
        ``self._pre_check_str(source)`` is falsy.
    """
    result: List[ExtractResult] = list()
    if not self._pre_check_str(source):
        return result

    matched: List[bool] = [False] * len(source)
    match_source: Dict[Match, str] = dict()

    for entry in self.regexes:
        for m in re.finditer(entry.re, source):
            for j in range(m.start(), m.end()):
                matched[j] = True
            # Keep Source Data for extra information
            match_source[m] = entry.val

    simple_tokenizer = SimpleTokenizer()

    def blocks_ellipsis(char: str) -> bool:
        # A digit or non-CJK letter touching the ellipsis means the span is
        # glued to a longer token.
        return char.isdigit() or (
            char.isalpha() and not simple_tokenizer.is_cjk(c=char))

    source_len = len(source)
    last = -1
    for i in range(source_len):
        if not matched[i]:
            last = i
            continue
        if i + 1 < source_len and matched[i + 1]:
            continue  # still inside the current span

        start = last + 1
        length = i - last
        substring = source[start:start + length].strip()

        if (substring.startswith(Constants.IPV6_ELLIPSIS)
                and start > 0 and blocks_ellipsis(source[start - 1])):
            continue
        # Look at the character *after* the span for the trailing case.
        if (substring.endswith(Constants.IPV6_ELLIPSIS)
                and i + 1 < source_len and blocks_ellipsis(source[i + 1])):
            continue

        src_match = next(
            (x for x in match_source
             if x.start() == start and (x.end() - x.start()) == length),
            None)
        if src_match is not None:
            value = ExtractResult()
            value.start = start
            value.length = length
            value.text = substring
            value.type = self._extract_type
            value.data = match_source.get(src_match, None)
            result.append(value)

    return result
def _extract_separate_units(self, source: str, num_depend_source: List[ExtractResult],
                            non_unit_matches) -> List[ExtractResult]:
    """Append stand-alone unit mentions to ``num_depend_source``.

    Every non-empty match of ``self.separate_regex`` that does not overlap a
    result already in ``num_depend_source`` is appended to that list as a
    new ``ExtractResult``. A match equal to
    ``Constants.AMBIGUOUS_TIME_TERM`` is skipped when
    ``self._dimension_inside_time`` reports it inside one of
    ``non_unit_matches``.

    :param source: text being analysed.
    :param num_depend_source: existing results; extended in place.
    :param non_unit_matches: iterable of time-expression matches.
    :return: ``num_depend_source`` (the same list object) for convenience.
    """
    match_result: List[bool] = [False] * len(source)
    for ex_result in num_depend_source:
        for offset in range(ex_result.length):
            match_result[ex_result.start + offset] = True

    for match in regex.finditer(self.separate_regex, source):
        text = match.group()
        if not text:
            continue

        begin = match.start()
        span = range(begin, begin + len(text))
        if any(match_result[pos] for pos in span):
            continue  # overlaps something that is already extracted

        # Mark the characters of this match itself as consumed.
        for pos in span:
            match_result[pos] = True

        if text == Constants.AMBIGUOUS_TIME_TERM and any(
                self._dimension_inside_time(match, time)
                for time in non_unit_matches):
            continue

        to_add = ExtractResult()
        to_add.start = begin
        to_add.length = len(text)
        to_add.text = text
        to_add.type = self.config.extract_type
        num_depend_source.append(to_add)

    return num_depend_source
def _extract_separate_units(
        self, source: str,
        num_depend_source: List[ExtractResult]) -> List[ExtractResult]:
    """Return a copy of ``num_depend_source`` extended with stand-alone units.

    Non-empty matches of ``self.separate_regex`` that do not overlap any
    result in ``num_depend_source`` are added to a deep copy of that list.

    :param source: text being analysed.
    :param num_depend_source: existing results; not modified.
    :return: the extended copy.
    """
    extended = deepcopy(num_depend_source)

    covered: List[bool] = [False] * len(source)
    for existing in num_depend_source:
        for pos in range(existing.start, existing.end + 1):
            covered[pos] = True

    for unit_match in regex.finditer(self.separate_regex, source):
        unit_text = unit_match.group()
        if not unit_text:
            continue

        unit_len = len(unit_text)
        begin = unit_match.start()
        if any(covered[begin + k] for k in range(unit_len)):
            continue

        # NOTE(review): this marks indices [0, unit_len) rather than the
        # match's own offsets; kept as-is to preserve behaviour.
        for k in range(unit_len):
            covered[k] = True

        unit = ExtractResult()
        unit.start = begin
        unit.length = unit_len
        unit.text = unit_text
        unit.type = self.config.extract_type
        extended.append(unit)

    return extended
def parse(self, source: ExtractResult) -> Optional[ParseResult]:
    """Resolve a number-with-unit extraction into a ``ParseResult``.

    ``source.text`` is split into unit fragments around the embedded number
    (``source.data`` when it is an ``ExtractResult``). The last fragment is
    looked up in ``self.config.unit_map``; on a hit the result's ``value``
    becomes a ``UnitValue`` and ``resolution_str`` is ``"<number> <unit>"``.

    :param source: extraction to resolve.
    :return: the parse result; its ``text`` is lower-cased.
    """
    parsed = ParseResult(source)

    if source.data and isinstance(source.data, ExtractResult):
        number_part = source.data
    else:
        # No embedded number: the text is just a unit.
        number_part = ExtractResult()
        number_part.start = -1
        number_part.length = 0
        number_part.text = None
        number_part.type = None

    # Collect the unit fragments of the text, skipping the number span.
    full_text = source.text
    text_len = len(full_text)
    units = []
    pending = ''
    pos = 0
    while pos <= text_len:
        if pos == text_len:
            if pending:
                self.__add_if_not_contained(units, pending.strip())
        elif pos == number_part.start:
            # number_part.start is relative to the start of full_text.
            if pending:
                self.__add_if_not_contained(units, pending.strip())
                pending = ''
            if number_part.length:
                pos = number_part.start + number_part.length - 1
        else:
            pending += full_text[pos]
        pos += 1

    # The unit type is decided by the last fragment.
    trailing_unit = units[-1]
    trailing_unit_lower = trailing_unit.lower()
    connector = self.config.connector_token
    if connector and trailing_unit_lower.startswith(connector):
        cut = len(connector)
        trailing_unit_lower = trailing_unit_lower[cut:].strip()
        trailing_unit = trailing_unit[cut:].strip()

    unit_map = self.config.unit_map
    if full_text and unit_map:
        if trailing_unit in unit_map:
            unit_value = unit_map[trailing_unit]
        elif trailing_unit_lower in unit_map:
            unit_value = unit_map[trailing_unit_lower]
        else:
            unit_value = None

        if unit_value:
            if number_part.text:
                num_value = self.config.internal_number_parser.parse(number_part)
            else:
                num_value = None
            resolution_str = num_value.resolution_str if num_value else None

            parsed.value = UnitValue(number=resolution_str, unit=unit_value)
            parsed.resolution_str = f'{resolution_str} {unit_value}'.strip()

    parsed.text = parsed.text.lower()
    return parsed
def strip_inequality(extract_result: ExtractResult, regexp: Pattern, in_prefix: bool):
    """Remove an inequality term matched by ``regexp`` from ``extract_result``.

    When ``regexp`` matches the result's text, every match is deleted, the
    text is stripped, ``length`` is recomputed and ``data`` is reset to
    ``''``. With ``in_prefix`` true, ``start`` is advanced by the number of
    characters removed. A result that does not match is left untouched.

    :param extract_result: result to modify in place.
    :param regexp: pattern describing the inequality term.
    :param in_prefix: whether the term sits before the remaining text.
    """
    if not regex.search(regexp, extract_result.text):
        return

    original_length = len(extract_result.text)
    # Delete the matched term from the text, not from the pattern's repr.
    extract_result.text = regex.sub(regexp, '', extract_result.text).strip()
    if in_prefix:
        extract_result.start += original_length - len(extract_result.text)
    extract_result.length = len(extract_result.text)
    extract_result.data = ''
def extract(self, source: str):
    """Extract choice mentions from ``source``.

    Each pattern in ``self.config.regexes_map`` is run against the
    lower-cased text. Every match is scored with ``self.match_value`` at
    each token position, and matches with a positive best score become
    ``ExtractResult`` items whose ``data`` is a ``ChoiceExtractDataResult``.

    :param source: text to scan; ``None`` and blank strings yield ``[]``.
    :return: all scored results ordered by ``start``, or only the
        best-scoring one when ``self.config.only_top_match`` is set.
    """
    results: List[ExtractResult] = list()
    # Guard first: calling ``.lower()`` on ``None`` would raise.
    if source is None or source.strip() == '':
        return results

    partial_results: List[ExtractResult] = list()
    trimmed_source = source.lower()
    source_tokens = self.__tokenize(trimmed_source)

    for (regexp, type_extracted) in self.config.regexes_map.items():
        for match in RegExpUtility.get_matches(regexp, trimmed_source):
            match_tokens = self.__tokenize(match)
            top_score = 0.0
            for i in range(len(source_tokens)):
                score = self.match_value(source_tokens, match_tokens, i)
                top_score = max(top_score, score)

            if top_score > 0.0:
                start = trimmed_source.index(match)
                length = len(match)

                value = ExtractResult()
                value.start = start
                value.length = length
                value.text = source[start:start + length].strip()
                value.type = type_extracted
                value.data = ChoiceExtractDataResult(source, top_score)
                partial_results.append(value)

    if not partial_results:
        return results

    partial_results = sorted(partial_results, key=lambda res: res.start)

    if not self.config.only_top_match:
        return partial_results

    # Keep the first result that carries the highest score.
    top_score = 0.0
    top_result_index = 0
    for i, candidate in enumerate(partial_results):
        if candidate.data.score > top_score:
            top_score = candidate.data.score
            top_result_index = i

    # NOTE(review): the previous code built a detached
    # ChoiceExtractDataResult, set ``other_matches`` on it and discarded it,
    # so the alternatives never reached the caller. Presumably they were
    # meant to be attached to the returned result's data - confirm.
    results.append(partial_results[top_result_index])
    return results
def extract(self, source: str) -> List[ExtractResult]:
    """Extract number mentions from ``source``.

    Characters covered by any match of ``self.regexes`` are merged into
    contiguous spans. A span is reported only when a single match covers
    exactly that span. When ``self._negative_number_terms`` is set and
    matches the text before a span, the span is widened to start at that
    match. The collected results are passed through
    ``self._filter_ambiguity`` before being returned.

    :param source: text to scan; ``None`` and blank strings yield ``[]``.
    :return: the filtered extraction results.
    """
    if source is None or not source.strip():
        return list()

    result: List[ExtractResult] = list()
    match_source = dict()
    matched: List[bool] = [False] * len(source)

    for entry in self.regexes:
        for m in regex.finditer(entry.re, source):
            for j in range(m.start(), m.end()):
                matched[j] = True
            # Keep Source Data for extra information
            match_source[m] = entry.val

    source_len = len(source)
    last = -1
    for i in range(source_len):
        if not matched[i]:
            last = i
            continue
        if i + 1 < source_len and matched[i + 1]:
            continue  # still inside the current span

        start = last + 1
        length = i - last
        substr = source[start:start + length].strip()
        # Look the span up before it is widened by a negative term.
        src_match = next(
            (x for x in match_source
             if x.start() == start and (x.end() - x.start()) == length),
            None)

        # extract negative numbers
        if self._negative_number_terms is not None:
            match = regex.search(self._negative_number_terms,
                                 source[0:start])
            if match is not None:
                start = match.start()
                length = length + match.end() - match.start()
                substr = source[start:start + length].strip()

        if src_match is not None:
            value = ExtractResult()
            value.start = start
            value.length = length
            value.text = substr
            value.type = self._extract_type
            value.data = match_source.get(src_match, None)
            result.append(value)

    result = self._filter_ambiguity(result, source)
    return result
def extract(self, source: str) -> List[ExtractResult]:
    """Extract every span of ``source`` covered by a valid regex match.

    All patterns in ``self.regexes`` are run. Matches accepted by
    ``self._is_valid_match`` mark their characters as covered, and covered
    characters are merged into contiguous spans. A span is reported only
    when one accepted match covers exactly that span; the ``val`` registered
    with that regex becomes the result's ``data``.

    :param source: text to scan.
    :return: extracted results in order of appearance; empty when
        ``self._pre_check_str(source)`` is falsy.
    """
    extracted: List[ExtractResult] = []
    if not self._pre_check_str(source):
        return extracted

    # Run every configured pattern up front.
    hits_per_pattern = [
        (list(re.finditer(entry.re, source)), entry.val)
        for entry in self.regexes
    ]

    covered: List[bool] = [False] * len(source)
    label_by_hit: Dict[Match, str] = {}
    for hits, label in hits_per_pattern:
        for hit in hits:
            if not self._is_valid_match(hit):
                continue
            for pos in range(hit.start(), hit.end()):
                covered[pos] = True
            # Remember which pattern label produced this match.
            label_by_hit[hit] = label

    text_len = len(source)
    run_begin = 0
    for pos in range(text_len):
        if not covered[pos]:
            run_begin = pos + 1
            continue
        if pos + 1 < text_len and covered[pos + 1]:
            continue  # the covered run continues

        run_len = pos + 1 - run_begin

        # Find the first recorded match spanning exactly this run.
        owner = None
        for candidate in label_by_hit:
            if (candidate.start() == run_begin
                    and candidate.end() - candidate.start() == run_len):
                owner = candidate
                break
        if owner is None:
            continue

        item = ExtractResult()
        item.start = run_begin
        item.length = run_len
        item.text = source[run_begin:run_begin + run_len].strip()
        item.type = self._extract_type
        item.data = label_by_hit.get(owner, None)
        extracted.append(item)

    return extracted
def merge_all_tokens(tokens: List[Token], source: str, extractor_name: str) -> List[ExtractResult]:
    """Collapse overlapping tokens and convert the survivors to results.

    Falsy entries are dropped and the rest are processed in order of
    ``start``. A token is discarded when it lies inside, or starts strictly
    inside, an already kept token. A token that fully covers a kept token
    replaces it. Each remaining token becomes an ``ExtractResult`` typed
    ``extractor_name`` and carrying the token's ``metadata``.

    :param tokens: candidate tokens; may contain ``None``.
    :param source: text the token offsets refer to.
    :param extractor_name: value stored in each result's ``type``.
    :return: one result per surviving token.
    """
    kept: List[Token] = []

    for candidate in sorted(filter(None, tokens), key=lambda t: t.start):
        is_new = True
        for slot, existing in enumerate(kept):
            if not is_new:
                break

            inside = (candidate.start >= existing.start
                      and candidate.end <= existing.end)
            starts_within = existing.start < candidate.start < existing.end
            covers = (candidate.start <= existing.start
                      and candidate.end >= existing.end)

            if inside or starts_within or covers:
                is_new = False
            if covers:
                # The wider token takes the place of the one it covers.
                kept[slot] = candidate

        if is_new:
            kept.append(candidate)

    merged = []
    for token in kept:
        begin = token.start
        size = token.length

        item = ExtractResult()
        item.start = begin
        item.length = size
        item.text = source[begin:begin + size]
        item.type = extractor_name
        item.data = None
        item.meta_data = token.metadata
        merged.append(item)

    return merged
def extract(self, source: str) -> List[ExtractResult]:
    """Extract spans matched by ``self.regexes`` from a preprocessed text.

    The text is first rewritten by
    ``self.__preprocess_with_number_extracted``. Characters covered by any
    regex match in the rewritten text are merged into contiguous spans, one
    ``ExtractResult`` each. ``self.__post_processing`` then maps the results
    back onto the original text.

    :param source: original text.
    :return: whatever ``self.__post_processing`` returns for the spans.
    """
    origin = source

    # Preprocess the sentence by extracting and replacing its numbers.
    prepared = self.__preprocess_with_number_extracted(origin)
    source = prepared.source
    position_map = prepared.position
    number_results = prepared.results

    per_pattern = [list(regex.finditer(pattern, source))
                   for pattern in self.regexes]

    covered: List[bool] = [False] * len(source)
    for hits in per_pattern:
        for hit in hits:
            begin = hit.start()
            for offset in range(len(hit.group())):
                covered[begin + offset] = True

    spans = []
    text_len = len(source)
    run_begin = 0
    for pos in range(text_len):
        if not covered[pos]:
            run_begin = pos + 1
            continue
        if pos + 1 != text_len and covered[pos + 1]:
            continue  # the covered run continues

        run_len = pos + 1 - run_begin
        item = ExtractResult()
        item.start = run_begin
        item.length = run_len
        item.text = source[run_begin:run_begin + run_len].strip()
        item.type = self._extract_type
        spans.append(item)

    # Post-processing restores the extracted numbers.
    return self.__post_processing(spans, origin, position_map,
                                  number_results)
def extract(self, source: str) -> List[ExtractResult]:
    """Extract number-with-unit mentions from ``source``.

    Numbers found by ``self.config.unit_num_extractor`` are combined with
    the nearest unit reported by ``self.prefix_matcher`` (before the number)
    and ``self.suffix_matcher`` (after it). Stand-alone units are added via
    ``self._extract_separate_units``, ambiguous results are filtered by
    ``self._filter_ambiguity``, and ``self.config.expand_half_suffix`` is
    invoked last.

    :param source: text to scan.
    :return: extraction results; each result's ``data`` is the number
        ``ExtractResult`` with ``start`` rewritten relative to the result.
        Empty when ``self._pre_check_str(source)`` is falsy.
    """
    if not self._pre_check_str(source):
        return []

    non_unit_match = None
    numbers = None
    mapping_prefix: Dict[float, PrefixUnitResult] = dict()
    result = []
    prefix_matched = False

    prefix_match: List[MatchResult] = sorted(
        self.prefix_matcher.find(source), key=lambda o: o.start)
    suffix_match: List[MatchResult] = sorted(
        self.suffix_matcher.find(source), key=lambda o: o.start)

    if prefix_match or suffix_match:
        numbers: List[ExtractResult] = sorted(
            self.config.unit_num_extractor.extract(source),
            key=lambda o: o.start)

        if (numbers
                and self.config.extract_type == Constants.SYS_UNIT_CURRENCY
                and prefix_match and suffix_match):
            # A number squeezed between a prefix unit and a suffix unit and
            # containing a comma is split at the comma, then numbers are
            # extracted again from the rewritten text.
            for number in numbers:
                start = number.start
                length = number.length
                has_prefix = any(
                    mr.start + mr.length == start for mr in prefix_match)
                has_suffix = any(
                    mr.start == start + length for mr in suffix_match)
                if has_prefix and has_suffix and "," in number.text:
                    comma_index = number.start + number.text.index(",")
                    source = (source[:comma_index] + " "
                              + source[comma_index + 1:])

            numbers: List[ExtractResult] = sorted(
                self.config.unit_num_extractor.extract(source),
                key=lambda o: o.start)

        # Special case for cases where number multipliers clash with unit
        ambiguous_multiplier_regex = \
            self.config.ambiguous_unit_number_multiplier_regex
        if ambiguous_multiplier_regex is not None:
            for num in numbers:
                match = [m for m in regex.finditer(
                    ambiguous_multiplier_regex, num.text) if m.group()]
                if len(match) == 1:
                    new_length = num.length - \
                        (match[0].span()[1] - match[0].span()[0])
                    num.text = num.text[0:new_length]
                    num.length = new_length

        for number in numbers:
            if number.start is None or number.length is None:
                continue

            start = int(number.start)
            length = int(number.length)
            max_find_pref = min(self.max_prefix_match_len, number.start)
            max_find_suff = len(source) - start - length

            if max_find_pref != 0:
                last_index = start
                best_match = None
                for m in prefix_match:
                    if m.length > 0 and m.end > start:
                        break
                    if (m.length > 0
                            and source[m.start:last_index].strip() == m.text):
                        best_match = m
                        break

                if best_match is not None:
                    off_set = last_index - best_match.start
                    unit_str = source[best_match.start:
                                      best_match.start + off_set]
                    self.add_element(mapping_prefix, number.start,
                                     (PrefixUnitResult(off_set, unit_str)))

            prefix_unit = mapping_prefix.get(start, None)

            if max_find_suff > 0:
                max_len = 0
                first_index = start + length
                for m in suffix_match:
                    if m.length > 0 and m.start >= first_index:
                        end_pos = m.start + m.length - first_index
                        if max_len < end_pos:
                            # Only blanks or the connector token may sit
                            # between the number and the unit.
                            mid_str = source[first_index:m.start].strip()
                            if (not mid_str
                                    or mid_str == self.config.connector_token):
                                max_len = end_pos

                if max_len != 0:
                    er = ExtractResult()
                    er.start = start
                    er.length = length + max_len
                    er.text = source[start:start + length + max_len]
                    er.type = self.config.extract_type

                    if prefix_unit is not None:
                        prefix_matched = True
                        er.start -= prefix_unit[0].offset
                        er.length += prefix_unit[0].offset
                        er.text = prefix_unit[0].unit + er.text

                    # Relative position will be used in Parser
                    number.start = start - er.start
                    er.data = number

                    # Special treatment, handle cases like '2:00 pm',
                    # '00 pm' is not dimension
                    is_not_unit = False
                    if er.type == Constants.SYS_UNIT_DIMENSION:
                        if non_unit_match is None:
                            non_unit_match = list(
                                self.config.non_unit_regex.finditer(source))
                        for time in non_unit_match:
                            if (er.start >= time.start()
                                    and er.start + er.length <= time.end()):
                                is_not_unit = True
                                break

                    if is_not_unit:
                        continue

                    result.append(er)

            if prefix_unit and not prefix_matched:
                er = ExtractResult()
                er.start = number.start - prefix_unit[0].offset
                er.length = number.length + prefix_unit[0].offset
                er.text = prefix_unit[0].unit + number.text
                er.type = self.config.extract_type

                # Relative position will be used in Parser
                number.start = start - er.start
                er.data = number
                result.append(er)

    # Extract Separate unit
    if self.separate_regex:
        if non_unit_match is None:
            # Collect every time expression. A missing pattern simply means
            # there is nothing to exclude.
            non_unit_regex = self.config.non_unit_regex
            if non_unit_regex is not None:
                non_unit_match = list(non_unit_regex.finditer(source))
            else:
                non_unit_match = []

        # Appends to ``result`` in place.
        self._extract_separate_units(source, result, non_unit_match)

        # Remove common ambiguous cases
        result = self._filter_ambiguity(result, source)

    # Expand Chinese phrase to the `half` patterns when it follows closely
    # origin phrase.
    self.config.expand_half_suffix(source, result, numbers)

    return result
def extract(self, source: str) -> List[ExtractResult]:
    """Extract number-with-unit mentions using the prefix/suffix regexes.

    For every number found by ``self.config.unit_num_extractor`` the method
    looks for a unit immediately before it (``self.prefix_regex``, within
    ``self.max_prefix_match_len`` characters) and immediately after it
    (``self.suffix_regex``). Dimension results lying inside a match of
    ``self.config.pm_non_unit_regex`` are dropped. When
    ``self.separate_regex`` is set, the results are finally passed through
    ``self._extract_separate_units``.

    :param source: text to scan.
    :return: extraction results; empty when ``self._pre_check_str(source)``
        is falsy.
    """
    if not self._pre_check_str(source):
        return list()

    prefix_by_start: Dict[float, PrefixUnitResult] = dict()
    covered: List[bool] = [False] * len(source)
    numbers: List[ExtractResult] = self.config.unit_num_extractor.extract(
        source)
    found: List[ExtractResult] = list()
    total_len = len(source)

    # Pass 1: find the prefix unit, if any, attached to each number.
    if self.max_prefix_match_len != 0:
        for number in numbers:
            if number.start is None or number.length is None:
                continue

            window = min(self.max_prefix_match_len, number.start)
            if window == 0:
                continue

            left: str = source[number.start - window:number.start]
            left_len = len(left)
            chosen: Match = None
            for pattern in self.prefix_regex:
                for candidate in regex.finditer(pattern, left):
                    if not len(candidate.group()):
                        continue
                    # The unit must run right up to the number.
                    if left[candidate.start():left_len].strip() != candidate.group():
                        continue
                    if chosen is None or chosen.start() >= candidate.start():
                        chosen = candidate

            if chosen:
                prefix_by_start[number.start] = PrefixUnitResult(
                    offset=left_len - chosen.start(),
                    unit=left[chosen.start():left_len])

    # Pass 2: attach suffix units and build the results.
    for number in numbers:
        if number.start is None or number.length is None:
            continue

        start = number.start
        length = number.length
        remaining = total_len - start - length
        prefix_unit: PrefixUnitResult = prefix_by_start.get(start, None)

        if remaining > 0:
            right = source[start + length:start + length + remaining]

            suffix_len = 0
            for pattern in self.suffix_regex:
                for candidate in regex.finditer(pattern, right):
                    if not candidate.group():
                        continue
                    candidate_end = candidate.start() + len(candidate.group())
                    if candidate.start() < 0:
                        continue
                    gap: str = right[:min(candidate.start(), len(right))]
                    gap_is_bridgeable = (
                        not gap.strip()
                        or gap.strip() == self.config.connector_token)
                    if suffix_len < candidate_end and gap_is_bridgeable:
                        suffix_len = candidate_end

            if suffix_len != 0:
                # Bookkeeping only; the flags are not read in this method.
                for offset in range(length + suffix_len):
                    covered[offset + start] = True

                unit = ExtractResult()
                unit.start = start
                unit.length = length + suffix_len
                unit.text = source[start:start + length + suffix_len]
                unit.type = self.config.extract_type

                if prefix_unit:
                    unit.start -= prefix_unit.offset
                    unit.length += prefix_unit.offset
                    unit.text = prefix_unit.unit + unit.text

                # The number keeps a position relative to the result.
                number.start = start - unit.start
                unit.data = number

                inside_time = False
                if unit.type == Constants.SYS_UNIT_DIMENSION:
                    for time_match in self.config.pm_non_unit_regex.finditer(
                            source):
                        if (unit.start >= time_match.start()
                                and unit.end <= time_match.end()):
                            inside_time = True

                if not inside_time:
                    found.append(unit)
                continue

        if prefix_unit:
            unit = ExtractResult()
            unit.start = number.start - prefix_unit.offset
            unit.length = number.length + prefix_unit.offset
            unit.text = prefix_unit.unit + number.text
            unit.type = self.config.extract_type
            number.start = start - unit.start
            unit.data = number
            found.append(unit)

    if self.separate_regex:
        found = self._extract_separate_units(source, found)

    return found
def parse_specific_time(self, source: str, reference: datetime) -> DateTimeResolutionResult:
    """Parse a time range written as "from X to Y" / "between X and Y".

    The lower-cased, stripped text must match
    ``self.config.specific_time_from_to_regex`` or
    ``self.config.specific_time_between_and_regex`` at position 0. Hours,
    minutes and am/pm descriptions are read from the match's groups, the
    two end points are placed on the date of ``reference`` and then shifted
    by 12 or 24 hours according to the descriptions found.

    :param source: text to parse.
    :param reference: supplies the year, month and day for both end points.
    :return: a result with ``success`` set, ``timex`` of the form
        ``(Tbegin,Tend,PT..)`` and identical future/past values; an
        unfilled ``DateTimeResolutionResult`` when the text does not match.
    """
    result = DateTimeResolutionResult()
    year = reference.year
    month = reference.month
    day = reference.day
    source = source.strip().lower()

    match = regex.search(self.config.specific_time_from_to_regex, source)
    if not match:
        match = regex.search(self.config.specific_time_between_and_regex, source)

    # Only a match anchored at the very start of the text is accepted.
    if not match or match.start() != 0:
        return result

    # this "from .. to .." pattern is valid if followed by a Date OR "pm"
    # NOTE(review): ``valid`` is assigned but never read in this method.
    valid = False

    time1 = RegExpUtility.get_group(match, "time1")
    time2 = RegExpUtility.get_group(match, "time2")

    # get hours
    hour_group_list = RegExpUtility.get_group_list(
        match, Constants.HOUR_GROUP_NAME)

    # Prefer the word-to-number table and fall back to reading digits.
    # NOTE(review): a table entry mapping to 0 is falsy and also takes the
    # ``int()`` fallback - confirm that cannot fail for such words.
    hour_str = hour_group_list[0]
    begin_hour = self.config.numbers.get(hour_str, None)
    if not begin_hour:
        begin_hour = int(hour_str)

    hour_str = hour_group_list[1]
    end_hour = self.config.numbers.get(hour_str, None)
    if not end_hour:
        end_hour = int(hour_str)

    # get minutes
    minute_group_list = RegExpUtility.get_group_list(
        match, Constants.MINUTE_GROUP_NAME)

    # -1 means "no minute was written for this end point".
    begin_minute = end_minute = -1
    if len(minute_group_list) > 1:
        minute_str = minute_group_list[0]
        begin_minute = self.config.numbers.get(minute_str, None)
        if not begin_minute:
            begin_minute = int(minute_str)
        minute_str = minute_group_list[1]
        end_minute = self.config.numbers.get(minute_str, None)
        if not end_minute:
            end_minute = int(minute_str)
    elif len(minute_group_list) == 1:
        # A single minute belongs to whichever time text contains it.
        minute_str = minute_group_list[0]
        if minute_str in time1:
            begin_minute = self.config.numbers.get(minute_str, None)
            if not begin_minute:
                begin_minute = int(minute_str)
        elif minute_str in time2:
            end_minute = self.config.numbers.get(minute_str, None)
            if not end_minute:
                end_minute = int(minute_str)

    # parse AM/PM
    left_desc: str = RegExpUtility.get_group(
        match, Constants.LEFT_DESC_GROUP_NAME)
    right_desc: str = RegExpUtility.get_group(
        match, Constants.RIGHT_DESC_GROUP_NAME)

    # Fill a missing side from the generic description captures.
    desc_capture_list = RegExpUtility.get_group_list(
        match, Constants.DESC_GROUP_NAME)
    for desc_capture in desc_capture_list:
        if desc_capture in time1 and not left_desc:
            left_desc: str = desc_capture
        elif desc_capture in time2 and not right_desc:
            right_desc: str = desc_capture

    begin_date_time = datetime(
        year, month, day, hour=begin_hour,
        minute=begin_minute if begin_minute > 0 else 0)
    end_date_time = datetime(year, month, day, hour=end_hour,
                             minute=end_minute if end_minute > 0 else 0)

    # A description starting with 'a' counts as am, with 'p' as pm.
    has_left_am = left_desc != '' and left_desc.startswith('a')
    has_left_pm = left_desc != '' and left_desc.startswith('p')
    has_right_am = right_desc != '' and right_desc.startswith('a')
    has_right_pm = right_desc != '' and right_desc.startswith('p')
    has_left = has_left_am or has_left_pm
    has_right = has_right_am or has_right_pm

    # both time point has description like 'am' or 'pm'
    if has_left and has_right:
        if has_left_am:
            if begin_hour >= 12:
                begin_date_time -= timedelta(hours=12)
        else:
            if begin_hour < 12:
                begin_date_time += timedelta(hours=12)
        if has_right_am:
            if end_hour > 12:
                end_date_time -= timedelta(hours=12)
        else:
            if end_hour < 12:
                end_date_time += timedelta(hours=12)
    # one of the time point has description like 'am' or 'pm'
    elif has_left or has_right:
        if has_left_am:
            if begin_hour >= 12:
                begin_date_time -= timedelta(hours=12)
            if end_hour < 12:
                if end_date_time < begin_date_time:
                    end_date_time += timedelta(hours=12)
        elif has_left_pm:
            if begin_hour < 12:
                begin_date_time += timedelta(hours=12)
            if end_hour < 12:
                if end_date_time < begin_date_time:
                    # Push the end past the begin: a full day when the gap
                    # is 12 hours or more, otherwise half a day.
                    span: timedelta = begin_date_time - end_date_time
                    end_date_time += timedelta(
                        hours=24) if span >= timedelta(
                        hours=12) else timedelta(hours=12)
        if has_right_am:
            if end_hour >= 12:
                end_date_time -= timedelta(hours=12)
            if begin_hour < 12:
                if end_date_time < begin_date_time:
                    begin_date_time -= timedelta(hours=12)
        elif has_right_pm:
            if end_hour < 12:
                end_date_time += timedelta(hours=12)
            if begin_hour < 12:
                if end_date_time < begin_date_time:
                    begin_date_time -= timedelta(hours=12)
                else:
                    span = end_date_time - begin_date_time
                    if span >= timedelta(hours=12):
                        begin_date_time += timedelta(hours=12)
    # no 'am' or 'pm' indicator
    elif begin_hour <= 12 and end_hour <= 12:
        if begin_date_time > end_date_time:
            if begin_hour == 12:
                begin_date_time -= timedelta(hours=12)
            else:
                end_date_time += timedelta(hours=12)
        result.comment = Constants.AM_PM_GROUP_NAME

    # An end before the begin is moved forward by a day.
    if end_date_time < begin_date_time:
        end_date_time += timedelta(hours=24)

    # Minutes appear in the timex only when they were written in the text.
    if begin_minute >= 0:
        begin = f'T{begin_date_time.hour:02d}:{begin_date_time.minute:02d}'
    else:
        begin = f'T{begin_date_time.hour:02d}'
    if end_minute >= 0:
        end = f'T{end_date_time.hour:02d}:{end_date_time.minute:02d}'
    else:
        end = f'T{end_date_time.hour:02d}'

    # Adding the duration to midnight exposes it as hour/minute fields.
    difference = datetime(year, month, day) + (end_date_time - begin_date_time)
    if difference.minute != 0 and difference.hour != 0:
        result.timex = f'({begin},{end},PT{difference.hour}H{difference.minute}M)'
    elif difference.minute != 0 and difference.hour == 0:
        result.timex = f'({begin},{end},PT{difference.minute}M)'
    else:
        result.timex = f'({begin},{end},PT{difference.hour}H)'

    result.future_value = ResolutionStartEnd()
    result.past_value = ResolutionStartEnd()
    result.future_value.start = begin_date_time
    result.future_value.end = end_date_time
    result.past_value.start = result.future_value.start
    result.past_value.end = result.future_value.end
    result.success = True
    result.sub_date_time_entities = []

    # in SplitDateAndTime mode, time points will be taken from these sub_date_time_entities
    # cases like "from 4 to 5pm", "4" should not be treated as sub_date_time_entities
    if has_left or begin_minute >= 0:
        er = ExtractResult()
        er.start = match.start("time1")
        er.length = match.end("time1") - match.start("time1")
        er.text = time1
        er.type = Constants.SYS_DATETIME_TIME
        pr = self.config.time_parser.parse(er, reference)
        result.sub_date_time_entities.append(pr)

    # cases like "from 4am to 5", "5" should not be treated as sub_date_time_entities
    if has_right or end_minute >= 0:
        er = ExtractResult()
        er.start = match.start("time2")
        er.length = match.end("time2") - match.start("time2")
        er.text = time2
        er.type = Constants.SYS_DATETIME_TIME
        pr = self.config.time_parser.parse(er, reference)
        result.sub_date_time_entities.append(pr)

    return result
def merge_date_and_time(self, source: str, reference: datetime) -> List[Token]:
    """Merge neighbouring date and time mentions into date-time tokens.

    Dates and times are extracted separately. In calendar mode, matches of
    ``self.config.number_as_time_regex`` are added as integer time points.
    Two neighbouring, non-overlapping results are merged when they form a
    date/time pair (or a date followed by an integer) and the text between
    them is an accepted connector. Merged tokens are then widened by a
    match of ``self.config.suffix_regex`` after them and by a match of the
    common date prefix regex before them.

    :param source: text to scan.
    :param reference: reference date-time handed to the sub-extractors.
    :return: merged tokens; empty when no date, or neither a time nor a
        numeric time candidate, is found.
    """
    tokens: List[Token] = list()

    date_ers: List[ExtractResult] = \
        self.config.date_point_extractor.extract(source, reference)
    if not date_ers:
        return tokens

    time_ers = self.config.time_point_extractor.extract(source, reference)
    # Every occurrence is needed, not just a match anchored at position 0.
    time_num_matches = list(
        self.config.number_as_time_regex.finditer(source))

    if not time_ers and not time_num_matches:
        return tokens

    extract_results = date_ers
    extract_results.extend(time_ers)

    # handle cases which use numbers as time points
    # only enabled in CalendarMode
    if (self.config.options & DateTimeOptions.CALENDAR) != 0:
        for match in time_num_matches:
            node = ExtractResult()
            node.start = match.start()
            node.length = len(match.group())
            node.text = match.group()
            node.type = NumConstants.SYS_NUM_INTEGER
            extract_results.append(node)

    extract_results = sorted(extract_results, key=lambda x: x.start)

    i = 0
    while i < len(extract_results) - 1:
        # Skip everything that overlaps the current result.
        j = i + 1
        while j < len(extract_results) and extract_results[i].overlap(
                extract_results[j]):
            j += 1
        if j >= len(extract_results):
            break

        first = extract_results[i]
        second = extract_results[j]
        mergeable = (
            (first.type == Constants.SYS_DATETIME_DATE
             and second.type == Constants.SYS_DATETIME_TIME)
            or (first.type == Constants.SYS_DATETIME_TIME
                and second.type == Constants.SYS_DATETIME_DATE)
            or (first.type == Constants.SYS_DATETIME_DATE
                and second.type == NumConstants.SYS_NUM_INTEGER))

        if mergeable:
            middle_begin = first.start + (first.length or 0)
            middle_end = second.start or 0
            if middle_begin > middle_end:
                i = j + 1
                continue

            middle_str = source[middle_begin:middle_end].strip()
            valid = False

            # for cases like "tomorrow 3", "tomorrow at 3"
            if second.type == NumConstants.SYS_NUM_INTEGER:
                match = self.config.date_number_connector_regex.search(
                    middle_str)
                if not middle_str or match:
                    valid = True
            else:
                # for case like "3 pm or later on monday"
                match = self.config.suffix_after_regex.search(middle_str)
                if match:
                    # Keep only what follows the "or later"-style suffix.
                    middle_str = middle_str[match.end():].strip()

                if not (match and len(middle_str) == 0):
                    if self.config.is_connector_token(middle_str):
                        valid = True

            if valid:
                begin = first.start or 0
                end = (second.start or 0) + (second.length or 0)

                end_index, start_index = self.extend_with_date_time_and_year(
                    begin, end, source, reference)

                tokens.append(Token(start_index, end_index))
                i = j + 1
                continue

        i = j

    # handle "in the afternoon" at the end of entity
    for idx in range(len(tokens)):
        after_str = source[tokens[idx].end:]
        match = self.config.suffix_regex.search(after_str)
        if match:
            tokens[idx] = Token(tokens[idx].start,
                                tokens[idx].end + len(match.group()))

    # handle "day" prefixes
    for idx in range(len(tokens)):
        before_str = source[0:tokens[idx].start]
        match = self.config.utility_configuration.common_date_prefix_regex.search(
            before_str)
        if match:
            tokens[idx] = Token(tokens[idx].start - len(match.group()),
                                tokens[idx].end)

    return tokens
def __merged_compound_units(self, source: str):
    """Group adjacent unit extractions of ``source`` into compound units.

    Results from ``NumberWithUnitExtractor`` (after
    ``self.__merge_pure_number``) get a group id. Neighbours share an id
    when only whitespace, or a full match of
    ``self.config.compound_unit_connector_regex``, separates them. Each
    group becomes one result typed ``Constants.SYS_UNIT_CURRENCY`` whose
    ``data`` lists its members; groups with a single member are unwrapped
    again. Results typed ``Constants.SYS_NUM`` are removed at the end.

    :param source: text to scan.
    :return: list of merged extraction results.
    """
    ers = NumberWithUnitExtractor(self.config).extract(source)
    ers = self.__merge_pure_number(source, ers)

    count = len(ers)
    groups = [0] * count

    # Step 1: give every result a group id based on its left neighbour.
    for idx in range(count - 1):
        current, following = ers[idx], ers[idx + 1]

        if (current.type != following.type
                and not current.type == Constants.SYS_NUM
                and not following.type == Constants.SYS_NUM):
            continue

        if isinstance(current.data, ExtractResult) and not str(
                current.data.data).startswith("Integer"):
            groups[idx + 1] = groups[idx] + 1
            continue

        middle_begin = current.start + current.length
        middle_end = following.start
        middle_str = source[middle_begin:middle_begin
                            + (middle_end - middle_begin)].strip().lower()

        # Separated by whitespace
        if not middle_str:
            groups[idx + 1] = groups[idx]
            continue

        # Separated by connector
        match = self.config.compound_unit_connector_regex.match(middle_str)
        if match is not None:
            splitted_match = match.string.split(" ")

        if match and match.pos == 0 and len(splitted_match[0]) == len(middle_str):
            groups[idx + 1] = groups[idx]
        else:
            groups[idx + 1] = groups[idx] + 1

    # Step 2: fold every group into its first member.
    result = []
    for idx in range(count):
        if idx == 0 or groups[idx] != groups[idx - 1]:
            head = ers[idx]

            snapshot = ExtractResult()
            snapshot.data = head.data
            snapshot.length = head.length
            snapshot.start = head.start
            snapshot.text = head.text
            snapshot.type = head.type

            # The head's data now lists the group members, starting with a
            # copy of the head itself.
            head.data = [snapshot]
            result.append(head)

        # reduce extract results in same group
        if idx + 1 < count and groups[idx + 1] == groups[idx]:
            group = groups[idx]
            merged = result[group]
            follower = ers[idx + 1]

            period_begin = merged.start
            period_end = follower.start + follower.length

            merged.length = period_end - period_begin
            merged.text = source[period_begin:period_begin
                                 + (period_end - period_begin)]
            merged.type = Constants.SYS_UNIT_CURRENCY
            if isinstance(merged.data, list):
                merged.data.append(follower)

    # Step 3: unwrap groups that ended up with a single member.
    for idx in range(len(result)):
        inner_data = result[idx].data
        if len(inner_data) == 1:
            result[idx] = inner_data[0]

    return [item for item in result if not item.type == Constants.SYS_NUM]