"""Parse nipple state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) NIPPLE_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term("false", """ false """), VOCAB.term("much", """ much """), VOCAB.term( "lactation", r""" (indicate \s+)? (( previous | post | prior ) [\s-] ) (lactation | lactating | lac ) """, ), VOCAB.term( "other", """ protuberant prominent showing worn distended """.split(), ), # Separates measurements VOCAB.part("separator", r' [;"?/,] '), # Skip arbitrary words
"""Parse sex notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) SEX = Base( name=__name__.split(".")[-1], rules=[ # JSON keys for sex VOCAB.term("sex_key", "sex"), # The sexes VOCAB.term("sex_vocab", "females? males?".split()), # These are words that indicate that "sex" is not a key VOCAB.term("not_sex", "and is was".split()), # Allow arbitrary words in some cases VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '), # Some patterns need a terminator VOCAB.part("separator", ' [;,"] | $ '), # E.g.: sex might be female; VOCAB.producer( convert, """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """, ), # E.g.: sex=female?, Or: sex=unknown VOCAB.producer(convert, " sex_key (?P<value> ( sex_vocab | word ) quest? ) "), # E.g.: male, Or: male?
def convert(token):
    """Build an enlarged / not-enlarged nipple trait from a parsed token."""
    state = "enlarged" if token.group.get("pos") else "not enlarged"
    return Trait(value=state, start=token.start, end=token.end)


# Parser for nipple-enlargement notations.
NIPPLES_ENLARGED = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["conj"],
        # Separates measurements
        VOCAB.part("separator", r' [;"?/,] '),
        # Abbreviated enlarged / not-enlarged notations.
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),
        # Positive matches: "nipples enlarged", "enlarged nipples", or abbreviation.
        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged nipple ) """),
        VOCAB.producer(convert, """ (?P<pos> enlarged_abbrev ) """),
        # Negative matches.
        VOCAB.producer(convert, """ (?P<neg> none nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple none ) """),
        VOCAB.producer(convert, """ (?P<neg> nipple not_enlarged ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged false? nipple ) """),
        VOCAB.producer(convert, """ (?P<neg> not_enlarged_abbrev ) """),
    ],
)
return trait LACTATION_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part( "lactating", r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """, ), VOCAB.term("lactating_abbrev", r"[oc][esm]l"), VOCAB.term("not_lactating_abbrev", r"[oc][esm]n"), VOCAB.term("post", r""" post | finished """), # Separates measurements VOCAB.part("separator", r' [;"/] '), VOCAB.producer(convert, """ (?P<pos> lactating ) """), VOCAB.producer(convert, """ (?P<pos> lactating_abbrev ) """), VOCAB.producer(convert, """ (?P<neg> (none | post) lactating ) """), VOCAB.producer(convert, """ (?P<neg> lactating (none | post) ) """), VOCAB.producer(convert, """ (?P<neg> not_lactating_abbrev ) """), ], )
"""Parse v****a state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) VAGINA_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part("v****a", r""" (?<! sal ) ( v****a | vag | vulva ) """), VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()), VOCAB.part( "closed", r""" closed | imperforated | imperf | cerrada | non [-\s] perforated | unperforate | non [-\s] perf | clsd | imp """, ), VOCAB.part("open", r""" open | perforated? | perf | abrir """), VOCAB.part("other", r""" swollen | plugged | plug | sealed """), VOCAB.grouper("state", """ closed | open | other """), VOCAB.producer(convert, """ (?P<value> v****a partially? state ) """), VOCAB.producer(convert, """ (?P<value> state v****a state? ) """), VOCAB.producer(convert, """ (?P<value> ( state | abbrev ) v****a? ) """), ], )
def convert_state(token): """Convert parsed tokens into a result.""" trait = Trait(value="present", start=token.start, end=token.end) return trait PLACENTAL_SCAR_COUNT = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], # Adjectives to placental scars VOCAB.term( "adj", r""" faint prominent recent old possible """.split(), ), # Skip arbitrary words VOCAB["word"], VOCAB.part("sep", r" [;/] "), VOCAB.grouper( "count", """ none embryo conj | none visible | integer | none """, ), VOCAB.producer( convert_count, """(?P<count1> count ) op (?P<count2> count ) ( eq (?P<value> count ) )? plac_scar
# Single-character operators and separators.
VOCAB.part('x', r' [x×] ', capture=False)
VOCAB.part('quest', r' [?] ')
# Punctuation gets the lowest priority so named terms win over it.
VOCAB.part('comma', r' [,] ', capture=False, priority=LOWEST)
VOCAB.part('semicolon', r' [;] ', capture=False, priority=LOWEST)
VOCAB.part('ampersand', r' [&] ', capture=False)
VOCAB.part('eq', r' [=] ', capture=False)
VOCAB.part('under', r' [_] ', capture=False)
VOCAB.part('eol', r' [\n\r\f] ', capture=False)
VOCAB.part('dot', r' [.] ', capture=False)

# Small words
VOCAB.part('by', r' by ', capture=False)
VOCAB.part('to', r' to ', capture=False)
VOCAB.part('with', r' with ', capture=False)
VOCAB.part('up_to', r' ( up \s+ )? to ', capture=False)
VOCAB.term('and', r' and ', capture=False)
VOCAB.term('conj', ' or and '.split(), capture=False)
VOCAB.term('prep', ' to with on of '.split(), capture=False)
# Catch-all word token; lowest priority so specific terms match first.
VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)

# NOTE: Double quotes as inches is handled elsewhere
VOCAB.part('inches', r""" (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) """)
# Feet words, or a single quote immediately following a digit (e.g. 6').
VOCAB.part(
    'feet',
    r""" (?<! [a-z] ) ( foot s? | feet s? | ft s? (?! [,\w]) ) | (?<= \d ) ' """)
# Metric length units: millimeters, centimeters, meters, mm, cm, m.
VOCAB.part(
    'metric_len',
    r""" ( milli | centi )? meters? | ( [cm] [\s.]? m ) (?! [a-ru-wyz] ) """)
# Any length unit.
VOCAB.grouper('len_units', ' metric_len feet inches'.split())
return squash(traits) COLLECTOR = Base( name='collector', rules=[ VOCAB['eol'], VOCAB['month_name'], STATE_NAMES, VOCAB.part('col_label', r""" \b ( collect(or|ed) | coll | col ) ( \s* by )? """, capture=False), VOCAB.term('no_label', r""" number no num """.split(), capture=False), VOCAB.term('part', r""" [[:alpha:]]+ """, priority=LOWEST, capture=False), VOCAB.term('other_label', r""" art artist ass assist assistant auth authors? cartographer conservator contributor corator curator curatorial det determiner dir director ecologist editor entomologist expedition explorer extractor gardener geographer geologist georeferencer grower herbarium horticulturalist illustrator manager naturalist
import pandas as pd import regex from traiter.old.vocabulary import Vocabulary from digi_leap.pylib import const, patterns STATE_CSV = const.DATA_DIR / 'US_states.csv' STATES = {} STATE_NAMES = [] NORMALIZE_US_STATE = {} VOCAB = Vocabulary(patterns.VOCAB) VOCAB.term( 'USA', r""" U\.?S\.?A\.? | U\.?S\.? | United \s? States \s? of \s? America | United \s? States | U\.? \s? of \s? A\.?""") def normalize_key(state: str) -> str: """Convert state abbreviations into a consistent key.""" return regex.sub(r'[^a-z]+', '', state.lower()) def normalize_state(state: str) -> str: """Convert state abbreviations to the state name.""" return NORMALIZE_US_STATE.get(normalize_key(state), state.title()) def build_state(state, postal, abbrev_blob):
def fix_up(trait, text): """Fix problematic parses.""" # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) TRAGUS_LENGTH = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: tragusLengthInMillimeters VOCAB.term( "key_with_units", r"""( tragus \s* ) \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a tragus length follows VOCAB.term( "key", r""" ( tragus | trag | tragi ) \s* (length | len | l )? | tr """, ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: tragus 9/16" VOCAB.producer( fraction, [ "key len_fraction units", # E.g.: tragus = 9/16 inches
if IS_LEFT.search(text, trait.end, end): return True return False TOTAL_LENGTH = Base( name=__name__.split(".")[-1], fix_up=fix_up, rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: TotalLengthInMillimeters VOCAB.term( "key_with_units", r""" ( total | snout \s* vent | head \s* body | fork ) \s* ( length | len )? \s* in \s* (?P<units> millimeters | mm ) """, ), # Various total length keys VOCAB.part( "len_key", r""" t \s* [o.]? \s* l [._]? (?! [a-z] ) | total [\s-]* length [\s-]* in | ( total | max | standard ) [\s-]* lengths? \b | meas [\s*:]? \s* length [\s(]* [l] [)\s:]* | meas ( [a-z]* )? \.? : \s* l (?! [a-z.] ) | s \.? \s? l \.? (?! [a-z.] ) | label [\s.]* lengths? \b | ( fork | mean | body ) [\s-]* lengths? \b
import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.reproductive import convert, double VOCAB = Vocabulary(patterns.VOCAB) OVARY_SIZE = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # A key with units, like: gonadLengthInMM VOCAB.term( "key_with_units", r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), VOCAB.grouper("value", " cross | number len_units? "), # E.g.: active, Or: immature VOCAB.grouper("state", "active mature destroyed visible developed".split()), # Male or female ambiguous, like: gonadLength1 VOCAB.grouper( "ambiguous", """ ambiguous_key dim_side | side ambiguous_key dimension | ambiguous_key dimension """,
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Turn a parsed token into a pregnancy-state trait."""
    value = "pregnant" if token.group.get("pos") else "not pregnant"
    return Trait(value=value, start=token.start, end=token.end)


# Parser for pregnancy-state notations.
PREGNANCY_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Words (and misspellings) meaning "pregnant".
        VOCAB.term(
            "pregnant",
            r""" prega?n?ant pregnan preg pregnancy pregnancies gravid """.split(),
        ),
        VOCAB.part("separator", r' [;,"] '),
        # Negated forms are listed before the bare positive form.
        VOCAB.producer(convert, """ (?P<neg> pregnant none) """),
        VOCAB.producer(convert, """ (?P<neg> none pregnant ) """),
        VOCAB.producer(convert, """ (?P<pos> pregnant ) """),
    ],
)
"""Parse pregnancy state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) PREGNANCY_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term( "pregnant", r""" prega?n?ant pregnan preg pregnancy pregnancies gravid post[\s\-]?parous multiparous nulliparous parous primiparous """.split(), ), VOCAB.term("joiner", r""" of were """.split()), VOCAB.term( "recent", r""" recently recent was previously prev """.split(), ), VOCAB.term( "probably", r""" probably prob possibly possible appears? very visible visibly evidence evident
def typed(token): """Convert single value tokens into a result.""" trait = Trait(start=token.start, end=token.end) trait.notation = token.group["notation"] trait.value = to_positive_int(token.group["value1"]) trait.value += to_positive_int(token.group.get("value2")) return trait NIPPLE_COUNT = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB.term("id", r" \d+-\d+ "), VOCAB.term("adj", r""" inguinal ing pectoral pec pr """.split()), VOCAB.part("number", r" number | no | [#] "), VOCAB.part("eq", r" is | eq | equals? | [=] "), # Skip arbitrary words VOCAB["word"], VOCAB["sep"], VOCAB.grouper("count", " (?: integer | none )(?! side ) "), VOCAB.grouper("modifier", "adj visible".split()), VOCAB.grouper("skip", " number eq? integer "), VOCAB.producer( typed, """ (?P<notation> (?P<value1> count) modifier (?P<value2> count) modifier ) nipple
def fix_up(trait, text): """Fix problematic parses.""" # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) HIND_FOOT_LENGTH = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: HindFootLengthInMillimeters VOCAB.term( "key_with_units", r"""( hind \s* )? foot \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a hind foot length follows VOCAB.term( "key", [ r"hind \s* foot \s* with \s* (?P<includes> claw )", r"hind \s* foot ( \s* ( length | len ) )?", "hfl | hf", ], ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: hindFoot 9/16"
] value = [round(lbs + oz, 2) for oz in ozs] trait.value = squash(value) add_flags(token, trait) return trait BODY_MASS = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Looking for keys like: MassInGrams VOCAB.term( "key_with_units", r""" ( weight | mass) [\s-]* in [\s-]* (?P<mass_units> grams | g | lbs ) """, ), # These words indicate a body mass follows VOCAB.part("key_leader", "full observed total".split()), # Words for weight VOCAB.part("weight", "weights? weigh(ed|ing|s)?".split()), # Keys like: w.t. VOCAB.part("key_with_dots", r" \b w \.? \s? t s? \.? "), # Common prefixes that indicate a body mass VOCAB.part("mass", "mass"), VOCAB.part("body", "body"), # These indicate that the mass is NOT a body mass VOCAB.term( "other_wt",
return None # Try to disambiguate doubles quotes from inches return numeric_fix_ups(trait, text) EAR_LENGTH = Base( name=__name__.split(".")[-1], fix_up=fix_up, rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: EarLengthInMillimeters VOCAB.term( "key_with_units", r""" ear \s* ( length | len ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), # Abbreviation containing the measured from notation, like: e/n or e/c VOCAB.part( "char_measured_from", r""" (?<! [a-z] ) (?<! [a-z] \s ) (?P<ambiguous_key> e ) /? (?P<measured_from1> n | c ) [-]? (?! \.? [a-z] ) """, ), # The abbreviation key, just: e. This can be a problem. VOCAB.part( "char_key",
) trait2 = Trait( value=token.group["value"][1].lower(), side=token.group["side"][1].lower(), start=token.start, end=token.end, ) return [trait1, trait2] OVARIES_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term("other", """ sev somewhat few """.split()), # Skip words VOCAB.term("skip", " womb nullip ".split()), # VOCAB['comma'], VOCAB.part("sep", r" [;\(] "), # E.g.: ovaries and uterine horns # Or: ovaries and fallopian tubes VOCAB.grouper( "ovaries", r""" ovary ( ( and? uterus horns? ) | and? fallopian )? """, ), # E.g.: covered in copious fat VOCAB.grouper("coverage", " covered word{0,2} fat "), # E.g.: +corpus luteum
has_year = any(x for x in digits if len(x) >= YEAR_LEN) if not (has_month and has_year): return None trait = convert(token) if trait: trait.value = str(trait.value[:-2]) + '??' return trait LABEL_DATE = Base( name=__name__.split('.')[-1], rules=[ VOCAB['eol'], VOCAB['uuid'], # Get rid of these before they're a problem VOCAB.term('label', ' date '.split()), VOCAB.part('digits', r'(?<! \d ) ( [12]\d{3} | \d{1,2} ) (?! \d )'), VOCAB.part('sep', r' [/_-]+ ', capture=False), VOCAB.part('noise', r""" \w+ """, priority=LOWEST, capture=False), VOCAB.producer( convert, """ label? (?P<value> digits sep? month_name sep? digits ) """), VOCAB.producer( convert, """ label? (?P<value> month_name sep? digits sep? digits ) """), VOCAB.producer( convert, """ label? (?P<value> digits sep digits sep digits ) """), VOCAB.producer( short_date_digits, f""" label? (?P<value> digits sep digits ) """),
"""Parse testes size notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.reproductive import convert, double VOCAB = Vocabulary(patterns.VOCAB) TESTES_SIZE = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Note: abbrev differs from the one in the testes_state_trait VOCAB.term("abbrev", "tes ts tnd td tns ta".split()), # The abbreviation key, just: t. This can be a problem. VOCAB.part("char_key", r" \b t (?! [a-z] )"), # A key with units, like: gonadLengthInMM VOCAB.term( "key_with_units", r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), VOCAB.grouper( "value", """ cross | number len_units? (?! mass_units ) """, ),
import pandas as pd from traiter.old.vocabulary import LOWEST, Vocabulary from digi_leap.parsers.base import Base from digi_leap.pylib import const, patterns from digi_leap.pylib.trait import Trait PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv' PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv' VOCAB = Vocabulary(patterns.VOCAB) VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST) DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str) VOCAB.term('plant_family', DATA['complete_name'].tolist()) DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str) VOCAB.term('plant_genus', DATA['complete_name'].tolist()) def convert(token): """Normalize a parsed taxon notation""" return Trait(start=token.start, end=token.end, value=token.group['value']) PLANT_TAXON = Base(name='plant_taxon', rules=[ VOCAB['eol'], VOCAB.producer(convert, f' (?P<value> plant_genus word+ ) ')
VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Build a scrotal-state trait from a parsed token."""
    result = Trait(
        value=token.group["value"].lower(),
        start=token.start,
        end=token.end,
    )
    # Propagate the "ambiguous_key" flag from the token, if present.
    result.is_flag_in_token(token, "ambiguous_key")
    return result


# Parser for scrotal-state notations.
SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        # Testes and scrotal abbreviations.
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        # E.g.: "testes non-scrotal"
        VOCAB.producer(
            convert,
            """ (?P<value> ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) ) """,
        ),
        # E.g.: "non scrotal" or "scrotal"
        VOCAB.producer(convert, """ (?P<value> non? scrotal ) """),
        # An abbreviation preceded by a label.
        VOCAB.producer(convert, """ label (?P<value> scrotal_abbrev ) """),
    ],
)
trait.right = count return trait EMBRYO_COUNT = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers VOCAB["shorthand"], VOCAB["metric_mass"], VOCAB.part( "sex", r""" males? | females? | (?<! [a-z] ) [mf] (?! [a-z] ) """, ), VOCAB.term("repo_key", r""" reproductive \s data """), VOCAB.term("near_term", r" near[\s-]?term"), VOCAB.term("each_side", r" each \s side "), VOCAB.term("skip", r" w wt ".split()), VOCAB.part("sep", r" [;] "), VOCAB.part("bang", r" [!] "), VOCAB.grouper( "count", """ none (word | plac_scar) conj | integer | none | num_words | bang """, ), VOCAB.grouper("present", " found | near_term "), VOCAB.grouper("numeric", " integer | real "), VOCAB.grouper("skip_len", " ( x? numeric metric_len ) | (x numeric metric_len?) "), VOCAB.grouper("skip_words", " word | numeric | metric_len | eq "), VOCAB.grouper("side_link", " x | conj | word "),
"""Shared reproductive trait tokens (testes & ovaries).""" from traiter.old.vocabulary import LOWEST, Vocabulary import vertnet.pylib.patterns as patterns VOCAB = Vocabulary(patterns.VOCAB) VOCAB.term("sex", "females? | males? | [f]") VOCAB.term("active", "active inactive".split()) VOCAB.part("and", r" ( and \b | [&] ) ") VOCAB.term("count", r"""( only | all | both )? \s* [12]""") VOCAB.term( "color", r""" (( dark | light | pale ) \s* )? ( red | pink | brown | black | white | pigmented ) """, ) VOCAB.term("texture", " smooth ") VOCAB.term("covered", " covered ") VOCAB.term("destroyed", "destroy(ed)?") VOCAB.part( "size", r""" ( very \s+ )?
"""Patterns for names.""" import pandas as pd from traiter.old.vocabulary import Vocabulary from digi_leap.pylib import patterns from digi_leap.pylib.const import DATA_DIR NAME_CSV = DATA_DIR / 'name_parts.csv' SUFFIXES = 'filho ii iii jr sr'.split() VOCAB = Vocabulary(patterns.VOCAB) def build_name_parts(): """Build name patterns.""" df = pd.read_csv(NAME_CSV, na_filter=False, dtype=str) VOCAB.term('name_part', df['name'].tolist(), capture=False) build_name_parts() VOCAB.term('suffix', SUFFIXES) VOCAB.term('initial', r'[[:alpha:]] (?! \s* \d+ )')
VOCAB.part("comma", r" [,] ", capture=False, priority=LOWEST) VOCAB.part("semicolon", r" [;] ", capture=False, priority=LOWEST) VOCAB.part("colon", r" [:] ", capture=False, priority=LOWEST) VOCAB.part("ampersand", r" [&] ", capture=False) VOCAB.part("eq", r" [=] ", capture=False) VOCAB.part("plus", r" [+] ", capture=False) VOCAB.part("under", r" [_] ", capture=False) VOCAB.part("eol", r" [\n\r\f] ", capture=False) VOCAB.part("dot", r" [.] ", capture=False) # Small words VOCAB.part("by", r" by ", capture=False) VOCAB.part("to", r" to ", capture=False) VOCAB.part("with", r" with ", capture=False) VOCAB.part("up_to", r" ( up \s+ )? to ", capture=False) VOCAB.term("and", r" and ", capture=False) VOCAB.term("conj", " or and but ".split(), capture=False) VOCAB.term("prep", " to with on of in ".split(), capture=False) VOCAB.term("found", "found", capture=False) # NOTE: Double quotes as inches is handled elsewhere VOCAB.part( "inches", r""" (?<! [a-z] ) ( inch e? s? | in s? (?! [a-ru-wyz] ) ) (?! [:] ) """, ) VOCAB.part( "feet", r""" (?<! [a-z] ) ( foot s? (?! [:] ) | feet s? (?! [:] ) | ft s? (?! [,\w]) ) | (?<= \d ) ' """,
def fix_up(trait, text): """Fix problematic parses.""" # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) FOREARM_LENGTH = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: ForearmLengthInMillimeters VOCAB.term( "key_with_units", r"""( forearm \s* )? \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a forearm length follows VOCAB.term( "key", r""" forearm ( \s* ( length | len | l ) )? | fore? \s? [.]? \s? a | fa """, ), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False), VOCAB.grouper("noise", " word dash ".split()), # Handle fractional values like: forearm 9/16"
return None # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) TAIL_LENGTH = Base( name=__name__.split(".")[-1], fix_up=fix_up, rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Looking for keys like: tailLengthInMM VOCAB.term( "key_with_units", r""" tail \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # The abbreviation key, just: t. This can be a problem. VOCAB.part( "char_key", r""" \b (?P<ambiguous_key> t ) (?! [a-z] ) (?! _ \D ) """, ), # Standard keywords that indicate a tail length follows VOCAB.term("keyword", [r" tail \s* length ", r" tail \s* len ", "tail", "tal"]), # Some patterns require a separator VOCAB.part("sep", r" [;,] | $ ", capture=False),
import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) TIME_OPTIONS = VOCAB["time_units"].pattern LIFE_STAGE = Base( name=__name__.split(".")[-1], rules=[ # JSON keys for life stage VOCAB.term( "json_key", [ r" life \s* stage \s* (remarks?)? ", r" age \s* class ", r" age \s* in \s* (?P<time_units> {}) ".format(TIME_OPTIONS), r" age ", ], ), # These words are life stages without a keyword indicator VOCAB.term( "intrinsic", [ r" yolk \s? sac ", r" young [\s-]? of [\s-]? the [\s-]? year ", r" adult \s* young ", r" young \s* adult ", ] + """ ads? adulte?s? chicks?