"""Parse ovaries size notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.reproductive import convert, double VOCAB = Vocabulary(patterns.VOCAB) OVARY_SIZE = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # A key with units, like: gonadLengthInMM VOCAB.term( "key_with_units", r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), VOCAB.grouper("value", " cross | number len_units? "), # E.g.: active, Or: immature VOCAB.grouper("state", "active mature destroyed visible developed".split()), # Male or female ambiguous, like: gonadLength1 VOCAB.grouper( "ambiguous", """
"""Parse sex notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) SEX = Base( name=__name__.split(".")[-1], rules=[ # JSON keys for sex VOCAB.term("sex_key", "sex"), # The sexes VOCAB.term("sex_vocab", "females? males?".split()), # These are words that indicate that "sex" is not a key VOCAB.term("not_sex", "and is was".split()), # Allow arbitrary words in some cases VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '), # Some patterns need a terminator VOCAB.part("separator", ' [;,"] | $ '), # E.g.: sex might be female; VOCAB.producer( convert, """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """, ), # E.g.: sex=female?, Or: sex=unknown VOCAB.producer(convert, " sex_key (?P<value> ( sex_vocab | word ) quest? ) "), # E.g.: male, Or: male?
"""Parse nipple state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) def convert(token): """Convert parsed token into a trait.""" trait = Trait( value="enlarged" if token.group.get("pos") else "not enlarged", start=token.start, end=token.end, ) return trait NIPPLES_ENLARGED = Base( name=__name__.split(".")[-1], rules=[ VOCAB["conj"], VOCAB.part("separator", r' [;"?/,] '), VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"), VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"), VOCAB.term("false", """ false """), VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
"""Parse testes state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) def convert(token): """Convert parsed token into a trait producer.""" trait = Trait(value=token.group["value"].lower(), start=token.start, end=token.end) trait.is_flag_in_token(token, "ambiguous_key") return trait SCROTAL_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()), VOCAB.term("scrotal_abbrev", "ns sc".split()), # If possible exclude length. Ex: reproductive data=testes: 11x7 mm VOCAB.grouper("length", "cross len_units?"), VOCAB.producer( convert, """ (?P<value> ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) )
"""Parse v****a state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) VAGINA_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part("v****a", r""" (?<! sal ) ( v****a | vag | vulva ) """), VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()), VOCAB.part( "closed", r""" closed | imperforated | imperf | cerrada | non [-\s] perforated | unperforate | non [-\s] perf | clsd | imp """, ), VOCAB.part("open", r""" open | perforated? | perf | abrir """), VOCAB.part("other", r""" swollen | plugged | plug | sealed """), VOCAB.grouper("state", """ closed | open | other """), VOCAB.producer(convert, """ (?P<value> v****a partially? state ) """), VOCAB.producer(convert, """ (?P<value> state v****a state? ) """), VOCAB.producer(convert, """ (?P<value> ( state | abbrev ) v****a? ) """), ], )
"""Parse ovaries state notations.""" import regex from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) def convert(token): """Convert parsed token into a trait.""" value = token.group["value"].lower() if regex.match(r"^[\s\d]+$", value): return None trait = Trait(value=value, start=token.start, end=token.end) trait.is_flag_in_token(token, "ambiguous_key") trait.is_value_in_token(token, "side") return trait def double(token): """Convert a single token into two traits.""" trait1 = Trait( value=token.group["value"][0].lower(), side=token.group["side"][0].lower(), start=token.start, end=token.end, )
"""Parse testes size notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.reproductive import convert, double VOCAB = Vocabulary(patterns.VOCAB) TESTES_SIZE = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Note: abbrev differs from the one in the testes_state_trait VOCAB.term("abbrev", "tes ts tnd td tns ta".split()), # The abbreviation key, just: t. This can be a problem. VOCAB.part("char_key", r" \b t (?! [a-z] )"), # A key with units, like: gonadLengthInMM VOCAB.term( "key_with_units", r""" (?P<ambiguous_key> gonad ) \s* (?P<dim> length | len | width ) \s* in \s* (?P<len_units> millimeters | mm ) """, ), VOCAB.grouper( "value", """ cross | number len_units? (?! mass_units ) """, ),
"""Parse ear length notations."""
from functools import partial

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import (
    fraction,
    numeric_fix_ups,
    shorthand_length,
    simple_len,
)
from vertnet.pylib.util import FLAGS

VOCAB = Vocabulary(patterns.VOCAB)

# How far to look into the surrounding context to disambiguate the parse
LOOK_BACK_FAR = 40
LOOK_BACK_NEAR = 10

# These indicate that the parse is not really for an ear length
IS_ET = regex.compile(r" e \.? t ", FLAGS)
IS_NUMBER = regex.compile(" [#] ", FLAGS)
# NOTE(review): "magnemite" is an unusual token -- confirm it is intended.
IS_MAG = regex.compile(" magnemite ", FLAGS)
IS_ID = regex.compile(" identifier | ident | id ", FLAGS)

# The 'E' abbreviation gets confused with abbreviation for East sometimes.
# Try to disambiguate the two by looking for a North near by.
LOOK_AROUND = 10
IS_EAST = regex.compile(r" \b n ", FLAGS)
"""Parse date notations.""" from calendar import IllegalMonthError from datetime import date import regex from dateutil import parser from dateutil.relativedelta import relativedelta from traiter.old.vocabulary import LOWEST, Vocabulary from digi_leap.parsers.base import Base from digi_leap.pylib import patterns from digi_leap.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) YEAR_LEN = 2 def convert(token): """Normalize a parsed date.""" trait = Trait(start=token.start, end=token.end) value = regex.sub(r'[^a-z\d]+', '-', token.group['value'], flags=regex.I | regex.X) if len(value) < 4: return None
"""Patterns for US states.""" import pandas as pd import regex from traiter.old.vocabulary import Vocabulary from digi_leap.pylib import const, patterns STATE_CSV = const.DATA_DIR / 'US_states.csv' STATES = {} STATE_NAMES = [] NORMALIZE_US_STATE = {} VOCAB = Vocabulary(patterns.VOCAB) VOCAB.term( 'USA', r""" U\.?S\.?A\.? | U\.?S\.? | United \s? States \s? of \s? America | United \s? States | U\.? \s? of \s? A\.?""") def normalize_key(state: str) -> str: """Convert state abbreviations into a consistent key.""" return regex.sub(r'[^a-z]+', '', state.lower()) def normalize_state(state: str) -> str: """Convert state abbreviations to the state name.""" return NORMALIZE_US_STATE.get(normalize_key(state), state.title())
"""Parse hind foot length notations.""" from functools import partial from traiter.old.vocabulary import Vocabulary import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple VOCAB = Vocabulary(patterns.VOCAB) def fix_up(trait, text): """Fix problematic parses.""" # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) HIND_FOOT_LENGTH = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: HindFootLengthInMillimeters VOCAB.term( "key_with_units", r"""( hind \s* )? foot \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a hind foot length follows
"""Find collector notations on herbarium specimen labels.""" from itertools import zip_longest import regex from traiter.old.vocabulary import LOWEST, Vocabulary from traiter.util import squash from digi_leap.parsers import name_parts from digi_leap.parsers.base import Base from digi_leap.parsers.us_states import STATE_NAMES from digi_leap.pylib.trait import Trait VOCAB = Vocabulary(name_parts.VOCAB) MIN_LEN = 5 # Minimum collector name length def convert(token): """Build a collector trait""" names = regex.split(r'\s*(?:and|with|[,&])\s*', token.group.get('col_name')) traits = [] for name, suffix in zip_longest(names, names[1:], fillvalue=''): name = regex.sub(r'\.{3,}.*', '', name) if len(name) < MIN_LEN: continue trait = Trait(start=token.start, end=token.end)
"""Parse total length notations.""" from functools import partial import regex from traiter.old.vocabulary import Vocabulary import vertnet.pylib.numeric as numeric import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.numeric import compound, fix_up_inches, fraction from vertnet.pylib.util import FLAGS VOCAB = Vocabulary(patterns.VOCAB) # How far to look into the surrounding context to disambiguate the parse LOOK_BACK_FAR = 40 LOOK_BACK_NEAR = 10 # These indicate that the parse is not a total length IS_ID = regex.compile(" identifier | ident | id | collector ", FLAGS) IS_TRAP = regex.compile(" trap ", FLAGS) IS_TESTES = regex.compile( " reproductive | gonad | test | scrotal | scrotum | scrot ", FLAGS) # The 'L' abbreviation gets confused with abbreviation for Left sometimes. # Try to disambiguate the two by looking for a Right near by. LOOK_AROUND = 10 IS_LEFT = regex.compile(r" \b r \b ", FLAGS)
"""Parse tragus length notations.""" from functools import partial from traiter.old.vocabulary import Vocabulary import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple VOCAB = Vocabulary(patterns.VOCAB) def fix_up(trait, text): """Fix problematic parses.""" # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) TRAGUS_LENGTH = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: tragusLengthInMillimeters VOCAB.term( "key_with_units", r"""( tragus \s* ) \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a tragus length follows VOCAB.term(
"""Parse body mass notations.""" from traiter.old.vocabulary import Vocabulary from traiter.util import as_list, squash, to_positive_float import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.convert_units import convert_units from vertnet.pylib.numeric import add_flags, as_value, simple_mass from vertnet.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) def shorthand(token): """Convert a shorthand value like 11-22-33-44:55g.""" trait = Trait(start=token.start, end=token.end) flag = as_value(token, trait, "shorthand_wt", "shorthand_wt_units") trait.is_flag_in_token(token, "estimated_wt", rename="estimated_value") trait.is_shorthand = True return trait if flag else None def compound(token): """Convert a compound weight like: 2 lbs. 3.1 - 4.5 oz.""" trait = Trait(start=token.start, end=token.end) trait.units = [token.group["pounds"], token.group["ounces"]] trait.units_inferred = False trait.is_flag_missing(token, "key", rename="ambiguous_key") lbs = convert_units(to_positive_float(token.group["lbs"]), "lbs") ozs = [
"""Parse lactation state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base, convert VOCAB = Vocabulary(patterns.VOCAB) LACTATION_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part( "lactating", r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng | nursing | suckling ) \b """, ), VOCAB.part("not", r" \b ( not | non | no ) "), VOCAB.part( "post", r""" \b ( (( just | recently ) \s+ )? finished | post | recently | recent | had | pre ) """, ),
"""Parse placental scar counts.""" from traiter.old.vocabulary import Vocabulary from traiter.util import as_list import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait from vertnet.pylib.util import to_positive_int VOCAB = Vocabulary(patterns.VOCAB) SUB = {"l": "left", "r": "right", "m": "male", "f": "female"} def convert_count(token): """Convert parsed tokens into a result.""" trait = Trait(start=token.start, end=token.end) trait.value = to_positive_int(token.group.get("value")) count1 = to_positive_int(token.group.get("count1")) count2 = to_positive_int(token.group.get("count2")) side1 = SUB.get(token.group.get("side1", " ").lower()[0], "side1") side2 = SUB.get(token.group.get("side2", " ").lower()[0], "side2") if not trait.value: trait.value = count1 + count2 if count1 or side1 != "side1": setattr(trait, side1, count1)
"""Find taxon notations on herbarium specimen labels.""" import pandas as pd from traiter.old.vocabulary import LOWEST, Vocabulary from digi_leap.parsers.base import Base from digi_leap.pylib import const, patterns from digi_leap.pylib.trait import Trait PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv' PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv' VOCAB = Vocabulary(patterns.VOCAB) VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST) DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str) VOCAB.term('plant_family', DATA['complete_name'].tolist()) DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str) VOCAB.term('plant_genus', DATA['complete_name'].tolist()) def convert(token): """Normalize a parsed taxon notation""" return Trait(start=token.start, end=token.end, value=token.group['value']) PLANT_TAXON = Base(name='plant_taxon', rules=[ VOCAB['eol'], VOCAB.producer(convert,
"""Shared token patterns.""" from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary VOCAB = Vocabulary() # Chars that may be a token VOCAB.part('slash', r' [/] ', capture=False) VOCAB.part('dash', r' (?: – | - ) ', capture=False) VOCAB.part('open', r' [(\[] ', capture=False) VOCAB.part('close', r' [)\]] ', capture=False) VOCAB.part('x', r' [x×] ', capture=False) VOCAB.part('quest', r' [?] ') VOCAB.part('comma', r' [,] ', capture=False, priority=LOWEST) VOCAB.part('semicolon', r' [;] ', capture=False, priority=LOWEST) VOCAB.part('ampersand', r' [&] ', capture=False) VOCAB.part('eq', r' [=] ', capture=False) VOCAB.part('under', r' [_] ', capture=False) VOCAB.part('eol', r' [\n\r\f] ', capture=False) VOCAB.part('dot', r' [.] ', capture=False) # Small words VOCAB.part('by', r' by ', capture=False) VOCAB.part('to', r' to ', capture=False) VOCAB.part('with', r' with ', capture=False) VOCAB.part('up_to', r' ( up \s+ )? to ', capture=False) VOCAB.term('and', r' and ', capture=False) VOCAB.term('conj', ' or and '.split(), capture=False) VOCAB.term('prep', ' to with on of '.split(), capture=False) VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)
"""Parse embryo counts.""" from traiter.old.vocabulary import Vocabulary from traiter.util import as_list import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait from vertnet.pylib.util import to_positive_int VOCAB = Vocabulary(patterns.VOCAB) SUB = {"l": "left", "r": "right", "m": "male", "f": "female"} def convert(token): """Convert parsed tokens into a result.""" trait = Trait(start=token.start, end=token.end) if token.group.get("total"): trait.value = to_positive_int(token.group["total"]) if token.group.get("subcount"): trait.value = sum( to_positive_int(c) for c in as_list(token.group["subcount"])) if token.group.get("subcount") and token.group.get("sub"): for count, sub in zip(as_list(token.group["subcount"]), as_list(token.group.get("sub"))): count = "1" if count == "!" else count sub = SUB.get(sub[0].lower(), sub)
"""Parse lactation state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) def convert(token): """Convert parsed token into a trait.""" trait = Trait( value="lactating" if token.group.get("pos") else "not lactating", start=token.start, end=token.end, ) return trait LACTATION_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.part( "lactating", r""" ( lactating | lactation | lactated | lactate | lact | lactaing | lactacting | lactataing | lactational | oelact | celact | lactati | lactacting | lactatin | lactatting | lactatng
"""Patterns for names.""" import pandas as pd from traiter.old.vocabulary import Vocabulary from digi_leap.pylib import patterns from digi_leap.pylib.const import DATA_DIR NAME_CSV = DATA_DIR / 'name_parts.csv' SUFFIXES = 'filho ii iii jr sr'.split() VOCAB = Vocabulary(patterns.VOCAB) def build_name_parts(): """Build name patterns.""" df = pd.read_csv(NAME_CSV, na_filter=False, dtype=str) VOCAB.term('name_part', df['name'].tolist(), capture=False) build_name_parts() VOCAB.term('suffix', SUFFIXES) VOCAB.term('initial', r'[[:alpha:]] (?! \s* \d+ )')
"""Patterns for US counties.""" from collections import defaultdict import pandas as pd from traiter.old.vocabulary import Vocabulary from digi_leap.parsers import us_states from digi_leap.pylib import const COUNTY_CSV = const.DATA_DIR / 'US_counties.csv' VOCAB = Vocabulary(us_states.VOCAB) def build_counties(): """Read the CSV file and build counties by state.""" counties = defaultdict(list) df = pd.read_csv(COUNTY_CSV, na_filter=False, dtype=str) for _, row in df.iterrows(): counties[row['State']].append(row['County']) us_county = [] for abbrev in us_states.STATES.values(): names = [] for name in [n for n in counties[abbrev] if n not in us_states.STATES]: name = name.replace('.', r'\.?') name = name.replace("'", "'?") name = name.replace(' ', r'\s?') name = name.replace('-', r'[\s-]?') names.append(name)
"""Parse embryo lengths.""" from traiter.old.vocabulary import Vocabulary from traiter.util import as_list, to_positive_float import vertnet.pylib.convert_units as convert_units import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.numeric import add_flags, fix_up_inches, simple from vertnet.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) TOO_BIG = 1000 def convert(token): """Convert parsed token into a trait product.""" trait = simple(token, units="len_units") return trait if all(x < TOO_BIG for x in as_list(trait.value)) else None def isolate(token): """Convert parsed token into a trait product.""" token.group["number"] = [ v.strip() for v in token.group["value"].split("x") ] return convert(token) def convert_many(token):
"""Shared reproductive trait tokens (testes & ovaries).""" from traiter.old.vocabulary import LOWEST, Vocabulary import vertnet.pylib.patterns as patterns VOCAB = Vocabulary(patterns.VOCAB) VOCAB.term("sex", "females? | males? | [f]") VOCAB.term("active", "active inactive".split()) VOCAB.part("and", r" ( and \b | [&] ) ") VOCAB.term("count", r"""( only | all | both )? \s* [12]""") VOCAB.term( "color", r""" (( dark | light | pale ) \s* )? ( red | pink | brown | black | white | pigmented ) """, ) VOCAB.term("texture", " smooth ") VOCAB.term("covered", " covered ") VOCAB.term("destroyed", "destroy(ed)?") VOCAB.part( "size", r""" ( very \s+ )?
"""Parse forearm length notations.""" from functools import partial from traiter.old.vocabulary import Vocabulary import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple VOCAB = Vocabulary(patterns.VOCAB) def fix_up(trait, text): """Fix problematic parses.""" # Try to disambiguate doubles quotes from inches return fix_up_inches(trait, text) FOREARM_LENGTH = Base( name=__name__.split(".")[-1], rules=[ VOCAB["uuid"], # UUIDs cause problems with numbers # Units are in the key, like: ForearmLengthInMillimeters VOCAB.term( "key_with_units", r"""( forearm \s* )? \s* ( length | len ) \s* in \s* (?P<units> millimeters | mm ) """, ), # Standard keywords that indicate a forearm length follows
"""Shared token patterns.""" from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary from vertnet.pylib.util import NUM_WORDS, ORDINALS VOCAB = Vocabulary() # Chars that may be a token VOCAB.part("slash", r" [/] ", capture=False) VOCAB.part("dash", r" \p{Pd} ", capture=False) VOCAB.part("open", r" \p{Ps} ", capture=False) VOCAB.part("close", r" \p{Pe} ", capture=False) VOCAB.part("x", r" [x×] ", capture=False) VOCAB.part("quest", r" [?] ") VOCAB.part("comma", r" [,] ", capture=False, priority=LOWEST) VOCAB.part("semicolon", r" [;] ", capture=False, priority=LOWEST) VOCAB.part("colon", r" [:] ", capture=False, priority=LOWEST) VOCAB.part("ampersand", r" [&] ", capture=False) VOCAB.part("eq", r" [=] ", capture=False) VOCAB.part("plus", r" [+] ", capture=False) VOCAB.part("under", r" [_] ", capture=False) VOCAB.part("eol", r" [\n\r\f] ", capture=False) VOCAB.part("dot", r" [.] ", capture=False) # Small words VOCAB.part("by", r" by ", capture=False) VOCAB.part("to", r" to ", capture=False) VOCAB.part("with", r" with ", capture=False) VOCAB.part("up_to", r" ( up \s+ )? to ", capture=False) VOCAB.term("and", r" and ", capture=False)
"""Parse lactation state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait from vertnet.pylib.util import to_positive_int VOCAB = Vocabulary(patterns.VOCAB) def convert(token): """Convert single value tokens into a result.""" value = token.group.get("value") if not value: return None trait = Trait(start=token.start, end=token.end) trait.value = to_positive_int(value) if trait.value > 100: return None if token.group.get("notation"): trait.notation = token.group["notation"] return trait
"""Parse tail length notations.""" from functools import partial import regex from traiter.old.vocabulary import Vocabulary import vertnet.pylib.patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple from vertnet.pylib.util import FLAGS VOCAB = Vocabulary(patterns.VOCAB) # How far to look into the surrounding context to disambiguate the parse LOOK_BACK_FAR = 40 LOOK_BACK_NEAR = 20 # These indicate that the parse is not really for a tail length IS_TESTES = regex.compile( " reproductive | gonad | test | scrotal | scrotum | scrot ", FLAGS) IS_ELEVATION = regex.compile(" elevation | elev ", FLAGS) IS_TOTAL = regex.compile(" body | nose | snout ", FLAGS) IS_TAG = regex.compile(" tag ", FLAGS) IS_ID = regex.compile(" identifier | ident | id ", FLAGS) def fix_up(trait, text): """Fix problematic parses.""" # Check that this isn't a total length trait start = max(0, trait.start - LOOK_BACK_NEAR)
"""Parse testes state notations.""" from traiter.old.vocabulary import Vocabulary import vertnet.pylib.shared_reproductive_patterns as patterns from vertnet.parsers.base import Base from vertnet.pylib.trait import Trait VOCAB = Vocabulary(patterns.VOCAB) def convert(token): """Convert parsed token into a trait producer.""" trait = Trait( value="scrotal" if token.group.get("pos") else "not scrotal", start=token.start, end=token.end, ) return trait SCROTAL_STATE = Base( name=__name__.split(".")[-1], rules=[ VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()), VOCAB.term("scrotal_abbrev_pos", "sc".split()), VOCAB.term("scrotal_abbrev_neg", "ns ".split()), # If possible exclude length. Ex: reproductive data=testes: 11x7 mm VOCAB.grouper("length", "cross len_units?"), VOCAB.producer(convert, """ (?P<pos> scrotal_pos ) """),