def margin(span):
    """Build the trait data for a margin match.

    Gathers the lower-cased text of every token in ``span`` whose entity
    type is 'margin_term' or 'surface', reduces that list with squash(),
    and stores it under 'margin'. The 'subpart' entry is always 'margin'.
    """
    wanted_labels = {'margin_term', 'surface'}

    terms = []
    for tok in span:
        if tok.ent_type_ in wanted_labels:
            terms.append(tok.lower_)

    return {'subpart': 'margin', 'margin': squash(terms)}
def convert(token):
    """Build collector trait(s) from a collector match.

    The 'col_name' group is split into individual names on the words
    "and" / "with" and on commas / ampersands. Each piece has any run of
    three or more dots (and everything after it) removed, and pieces
    shorter than MIN_LEN are skipped. When the piece that follows a name
    is listed in name_parts.SUFFIXES it is appended to that name, and a
    piece that is itself a suffix never becomes a trait of its own.

    If the token has a 'collector_no' group it is stored on the first
    trait.

    Returns:
        squash() of the collected traits, or None when there is no
        'col_name', when no usable name was found, or when the collector
        number ends in 'm' / 'M'.
    """
    col_name = token.group.get('col_name')
    if col_name is None:
        # Nothing to split; regex.split() would raise TypeError on None.
        return None

    # Word boundaries keep "and" / "with" from matching inside a name
    # (e.g. "Sandy" or "Fernandez"); commas and ampersands split anywhere.
    names = regex.split(r'\s*(?:\b(?:and|with)\b|[,&])\s*', col_name)

    traits = []

    # Pair every piece with the piece that follows it ('' after the last).
    for name, suffix in zip_longest(names, names[1:], fillvalue=''):
        name = regex.sub(r'\.{3,}.*', '', name)
        if len(name) < MIN_LEN:
            continue

        trait = Trait(start=token.start, end=token.end)
        trait.col_name = name

        if suffix.lower() in name_parts.SUFFIXES:
            trait.col_name = f'{name} {suffix}'

        # A bare suffix was already attached to the preceding name.
        if name.lower() not in name_parts.SUFFIXES:
            traits.append(trait)

    if not traits:
        return None

    if token.group.get('collector_no'):
        col_no = token.group['collector_no']
        # Temp hack
        if col_no[-1] in ('m', 'M'):
            return None
        traits[0].col_no = col_no

    return squash(traits)
def surface(span):
    """Build the trait data for a surface match.

    Tokens are sorted by entity type:

    * 'part' / 'location' -- stored directly under that key (a later token
      of the same type overwrites an earlier one),
    * 'surface' -- appended to a list,
    * 'subpart' -- added to a set,
    * anything else whose lower-cased text is in PRESENT -- its PRESENCE
      value (False when absent from PRESENCE) is added to a set.

    Token text is normalized through REPLACE where an entry exists. When
    more than one subpart was collected, the literal 'surface' subpart is
    dropped. Only non-empty collections are emitted, each reduced with
    squash().
    """
    result = {}
    present_flags = set()
    subparts = set()
    surfaces = []

    for tok in span:
        kind = tok.ent_type_
        text = tok.lower_
        if kind in ('part', 'location'):
            result[kind] = REPLACE.get(text, text)
        elif kind == 'surface':
            surfaces.append(REPLACE.get(text, text))
        elif kind == 'subpart':
            subparts.add(REPLACE.get(text, text))
        elif text in PRESENT:
            present_flags.add(PRESENCE.get(text, False))

    if len(subparts) > 1:
        subparts.discard('surface')

    # Same key order as the original field table: present, subpart, surface.
    collected = (
        ('present', present_flags),
        ('subpart', subparts),
        ('surface', surfaces),
    )
    for key, items in collected:
        if items:
            result[key] = squash(items)

    return result
def as_value(token, trait, value_field="number", unit_field="units"):
    """Copy a token's numeric value(s) and unit(s) onto a trait.

    ``trait.units`` is always assigned first (squash() of the units, or
    None when the token has none), even if the function then bails out.

    Each raw value is run through to_positive_float() and then
    convert_units() using the unit at the same position; when there are
    fewer units than values the last unit is reused, and with no units at
    all None is passed as the unit.

    Returns:
        False if any value fails to_positive_float() (returns None) or if
        there are no values; True once ``trait.value`` and
        ``trait.units_inferred`` have been set.
    """
    unit_list = as_list(token.group.get(unit_field, []))
    if unit_list:
        trait.units = squash(unit_list)
    else:
        trait.units = None

    converted = []
    raw_values = as_list(token.group.get(value_field, []))
    for index, raw in enumerate(raw_values):
        number = to_positive_float(raw)
        if number is None:
            return False

        if index < len(unit_list):
            unit = unit_list[index]
        elif unit_list:
            unit = unit_list[-1]
        else:
            unit = None

        converted.append(convert_units(number, unit))

    if not converted:
        return False

    trait.value = squash(converted)
    trait.units_inferred = not trait.units
    return True
def shape(span):
    """Enrich a shape match.

    Collects the text of every token in ``span`` whose entity type is
    'shape' (lower-cased and normalized through REPLACE where an entry
    exists) and reduces the list with squash(). If the span also contains
    tokens of entity type 'part', the lower-cased text of the first one is
    stored under 'part'.

    Returns:
        dict with a 'shape' entry and, when a part token is present, a
        'part' entry.
    """
    data = {
        'shape': squash([
            REPLACE.get(t.lower_, t.lower_)
            for t in span if t.ent_type_ == 'shape'
        ]),
    }

    if field := [t.lower_ for t in span if t.ent_type_ == 'part']:
        data['part'] = field[0]

    # Hand the assembled fields back to the caller; without an explicit
    # return the function would yield None and the data would be lost.
    return data
def compound(token):
    """Build a trait from a feet-and-inches pattern like: 4 ft 9 in.

    The unit strings come from the 'feet' and 'inches' groups and the
    numbers from the 'ft' and 'in' groups. The 'in' group may hold several
    values; each one is converted, added to the converted feet value and
    rounded to two decimals. The totals are reduced with squash().
    """
    trait = Trait(start=token.start, end=token.end)
    groups = token.group

    trait.units = [groups["feet"], groups["inches"]]
    trait.units_inferred = False
    # NOTE(review): presumably flags the trait when the token has no "key"
    # group -- confirm against Trait.is_flag_missing.
    trait.is_flag_missing(token, "key", rename="ambiguous_key")

    feet = convert_units(to_positive_float(groups["ft"]), "ft")

    # Convert every inches value before doing any arithmetic.
    inch_parts = []
    for raw_inches in as_list(groups["in"]):
        inch_parts.append(convert_units(to_positive_float(raw_inches), "in"))

    totals = [round(feet + part, 2) for part in inch_parts]
    trait.value = squash(totals)

    add_flags(token, trait)
    return trait
def compound(token):
    """Build a trait from a compound weight like: 2 lbs. 3.1 - 4.5 oz.

    The unit strings come from the 'pounds' and 'ounces' groups and the
    numbers from the 'lbs' and 'ozs' groups. The 'ozs' group may hold
    several values; each one is converted, added to the converted pounds
    value and rounded to two decimals. The totals are reduced with
    squash().
    """
    trait = Trait(start=token.start, end=token.end)
    groups = token.group

    trait.units = [groups["pounds"], groups["ounces"]]
    trait.units_inferred = False
    # NOTE(review): presumably flags the trait when the token has no "key"
    # group -- confirm against Trait.is_flag_missing.
    trait.is_flag_missing(token, "key", rename="ambiguous_key")

    pounds = convert_units(to_positive_float(groups["lbs"]), "lbs")

    # Convert every ounces value before doing any arithmetic.
    ounce_parts = []
    for raw_ounces in as_list(groups["ozs"]):
        ounce_parts.append(convert_units(to_positive_float(raw_ounces), "ozs"))

    totals = [round(pounds + part, 2) for part in ounce_parts]
    trait.value = squash(totals)

    add_flags(token, trait)
    return trait
def transfer(self, token, names):
    """Copy the named token groups onto this object as attributes.

    For every entry of ``names`` that is a key of ``token.group``, the
    group's value(s) are lower-cased, reduced with squash(), and assigned
    to the attribute of the same name. Names missing from the token are
    left untouched.
    """
    for field in names:
        if field not in token.group:
            continue
        lowered = [item.lower() for item in as_list(token.group[field])]
        setattr(self, field, squash(lowered))