def process_address_1(instance: Instance) -> Instance:
    """Build a cleaned copy of *instance* (address rule, variant 1).

    The un-normalized text is carried over unchanged.  The normalized text
    keeps only lowercase ``a``-``z`` and the space character; every other
    character is deleted.

    Returns:
        A new ``Instance`` with the same ``token_type``.
    """
    written, spoken = instance.un_normalized, instance.normalized
    # Anything outside lowercase letters and plain spaces is dropped.
    spoken = re.sub(r"[^a-z ]", "", spoken)
    return Instance(
        token_type=instance.token_type,
        un_normalized=written,
        normalized=spoken,
    )
def process_cardinal_1(instance: Instance) -> Instance:
    """Build a cleaned copy of *instance* (cardinal rule, variant 1).

    The un-normalized text is reduced to its ASCII digits ``0``-``9``; every
    other character (separators, signs, whitespace, ...) is deleted.  The
    normalized text keeps only lowercase ``a``-``z`` and the space character.

    Returns:
        A new ``Instance`` with the same ``token_type``.
    """
    written, spoken = instance.un_normalized, instance.normalized
    written = re.sub(r"[^0-9]", "", written)
    spoken = re.sub(r"[^a-z ]", "", spoken)
    return Instance(
        token_type=instance.token_type,
        un_normalized=written,
        normalized=spoken,
    )
def process_time_1(instance: Instance) -> Instance:
    """Build a cleaned copy of *instance* (time rule, variant 1).

    Un-normalized text, rewritten in this order:

    1. ``": "`` collapses to ``":"``.
    2. A digit followed by ``a`` and ``m`` (each optionally preceded by one
       whitespace character) becomes ``"<digit> a.m."``.
    3. The same for ``p`` / ``m`` -> ``"<digit> p.m."``.

    Normalized text keeps only lowercase ``a``-``z`` and the space character.

    Returns:
        A new ``Instance`` with the same ``token_type``.
    """
    written = instance.un_normalized
    # Order matters: each rule runs on the output of the previous one.
    # NOTE: the trailing ``\s?`` in the am/pm patterns also consumes one
    # whitespace character that follows the "m", so it is not re-emitted.
    rewrites = (
        (r": ", ":"),
        (r"(\d)\s?a\s?m\s?", r"\1 a.m."),
        (r"(\d)\s?p\s?m\s?", r"\1 p.m."),
    )
    for pattern, replacement in rewrites:
        written = re.sub(pattern, replacement, written)

    spoken = re.sub(r"[^a-z ]", "", instance.normalized)
    return Instance(
        token_type=instance.token_type,
        un_normalized=written,
        normalized=spoken,
    )
def process_money_1(instance: Instance) -> Instance:
    """Build a cleaned copy of *instance* (money rule, variant 1).

    Un-normalized text, rewritten in this order:

    1. All commas are removed.
    2. ``a$`` becomes ``$``.
    3. ``us$`` becomes ``$``.
    4. A digit followed by ``m`` at the end (trailing whitespace allowed)
       becomes ``"<digit> million"``.
    5. A digit followed by ``b`` or ``bn`` at the end (trailing whitespace
       allowed) becomes ``"<digit> billion"``.

    Normalized text keeps only lowercase ``a``-``z`` and the space character.

    Returns:
        A new ``Instance`` with the same ``token_type``.
    """
    written, spoken = instance.un_normalized, instance.normalized
    # Applied sequentially; later rules see the output of earlier ones.
    rewrites = (
        (r",", ""),
        (r"a\$", r"$"),
        (r"us\$", r"$"),
        (r"(\d)m\s*$", r"\1 million"),
        (r"(\d)bn?\s*$", r"\1 billion"),
    )
    for pattern, replacement in rewrites:
        written = re.sub(pattern, replacement, written)

    spoken = re.sub(r"[^a-z ]", "", spoken)
    return Instance(
        token_type=instance.token_type,
        un_normalized=written,
        normalized=spoken,
    )
def process_measure_1(instance: Instance) -> Instance:
    """Build a cleaned copy of *instance* (measure rule, variant 1).

    Un-normalized text, rewritten in this order:

    1. All commas are removed.
    2. ``m2`` becomes ``m²``.
    3. A space is inserted between a digit and a directly following
       character that is not a digit, a dot, or whitespace.

    Normalized text, rewritten in this order:

    1. Everything except lowercase ``a``-``z`` and whitespace is removed.
    2. A final ``s`` in the phrase after ``"per "`` at the end of the text
       is dropped (``per <words>s`` -> ``per <words>``).
    3. Everything except lowercase ``a``-``z`` and the plain space is
       removed, which also discards any non-space whitespace kept by step 1.

    Returns:
        A new ``Instance`` with the same ``token_type``.
    """
    written, spoken = instance.un_normalized, instance.normalized

    written_rules = (
        (r",", ""),
        (r"m2", "m²"),
        (r"(\d)([^\d.\s])", r"\1 \2"),
    )
    for pattern, replacement in written_rules:
        written = re.sub(pattern, replacement, written)

    # The order of these three passes is significant; see the docstring.
    spoken_rules = (
        (r"[^a-z\s]", ""),
        (r"per ([a-z\s]*)s$", r"per \1"),
        (r"[^a-z ]", ""),
    )
    for pattern, replacement in spoken_rules:
        spoken = re.sub(pattern, replacement, spoken)

    return Instance(
        token_type=instance.token_type,
        un_normalized=written,
        normalized=spoken,
    )
def process_verbatim_1(instance: Instance) -> Instance:
    """Return a fresh ``Instance`` with all three fields copied unchanged.

    No text rewriting is performed for this rule; ``token_type``,
    ``un_normalized`` and ``normalized`` are passed straight through to a
    newly constructed ``Instance``.
    """
    written, spoken = instance.un_normalized, instance.normalized
    return Instance(
        token_type=instance.token_type,
        un_normalized=written,
        normalized=spoken,
    )