Example No. 1
"""Parse ovaries size notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.reproductive import convert, double

VOCAB = Vocabulary(patterns.VOCAB)

OVARY_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper("value", " cross | number len_units? "),
        # E.g.: active, Or: immature
        VOCAB.grouper("state",
                      "active mature destroyed visible developed".split()),
        # Male or female ambiguous, like: gonadLength1
        VOCAB.grouper(
            "ambiguous",
            """
Example No. 2
"""Parse sex notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

SEX = Base(
    name=__name__.split(".")[-1],
    rules=[
        # JSON keys for sex
        VOCAB.term("sex_key", "sex"),
        # The sexes
        VOCAB.term("sex_vocab", "females? males?".split()),
        # These are words that indicate that "sex" is not a key
        VOCAB.term("not_sex", "and is was".split()),
        # Allow arbitrary words in some cases
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # Some patterns need a terminator
        VOCAB.part("separator", ' [;,"] | $ '),
        # E.g.: sex might be female;
        VOCAB.producer(
            convert,
            """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
        ),
        # E.g.: sex=female?, Or: sex=unknown
        VOCAB.producer(convert,
                       " sex_key (?P<value> ( sex_vocab | word ) quest? ) "),
        # E.g.: male, Or: male?
"""Parse nipple state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait."""
    trait = Trait(
        value="enlarged" if token.group.get("pos") else "not enlarged",
        start=token.start,
        end=token.end,
    )
    return trait
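
A minimal sketch (not from the original source) of what this convert() returns,
using types.SimpleNamespace as a stand-in for the real traiter token object:

from types import SimpleNamespace

# Hypothetical token whose "pos" group matched something like "nipples enlarged".
fake_token = SimpleNamespace(group={"pos": "nipples enlarged"}, start=0, end=16)
print(convert(fake_token).value)  # -> "enlarged"

# With no "pos" group the same call reports "not enlarged".
fake_token = SimpleNamespace(group={}, start=0, end=16)
print(convert(fake_token).value)  # -> "not enlarged"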


NIPPLES_ENLARGED = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["conj"],
        VOCAB.part("separator", r' [;"?/,] '),
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),

        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
Example No. 4
"""Parse testes state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait producer."""
    trait = Trait(value=token.group["value"].lower(),
                  start=token.start,
                  end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    return trait


SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        VOCAB.producer(
            convert,
            """ (?P<value>
                ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) )
Example No. 5
"""Parse v****a state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

VAGINA_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part("v****a", r""" (?<! sal ) ( v****a | vag | vulva ) """),
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        VOCAB.part(
            "closed",
            r"""
                closed | imperforated | imperf | cerrada | non [-\s] perforated
                | unperforate | non  [-\s] perf | clsd | imp
            """,
        ),
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        VOCAB.grouper("state", """ closed | open | other """),
        VOCAB.producer(convert, """ (?P<value> v****a partially? state ) """),
        VOCAB.producer(convert, """ (?P<value> state v****a state? ) """),
        VOCAB.producer(convert,
                       """ (?P<value> ( state | abbrev )  v****a? ) """),
    ],
)
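
The parser above is the only one shown in full in this listing. Assuming the Base
class exposes a parse(text) method that returns matched traits (the method itself
is not shown in this excerpt), a hypothetical usage sketch would be:

if __name__ == "__main__":
    # Illustrative only; the input string and the parse() signature are assumptions.
    for trait in VAGINA_STATE.parse("vagina closed"):
        print(trait.value, trait.start, trait.end)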
Example No. 6
"""Parse ovaries state notations."""

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait."""
    value = token.group["value"].lower()
    if regex.match(r"^[\s\d]+$", value):
        return None
    trait = Trait(value=value, start=token.start, end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    trait.is_value_in_token(token, "side")
    return trait


def double(token):
    """Convert a single token into two traits."""
    trait1 = Trait(
        value=token.group["value"][0].lower(),
        side=token.group["side"][0].lower(),
        start=token.start,
        end=token.end,
    )
Example No. 7
"""Parse testes size notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.reproductive import convert, double

VOCAB = Vocabulary(patterns.VOCAB)

TESTES_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Note: abbrev differs from the one in the testes_state_trait
        VOCAB.term("abbrev", "tes ts tnd td tns ta".split()),
        # The abbreviation key, just: t. This can be a problem.
        VOCAB.part("char_key", r" \b t (?! [a-z] )"),
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper(
            "value",
            """ cross | number len_units? (?! mass_units ) """,
        ),
Example No. 8
from functools import partial

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import (
    fraction,
    numeric_fix_ups,
    shorthand_length,
    simple_len,
)
from vertnet.pylib.util import FLAGS

VOCAB = Vocabulary(patterns.VOCAB)

# How far to look into the surrounding context to disambiguate the parse
LOOK_BACK_FAR = 40
LOOK_BACK_NEAR = 10

# These indicate that the parse is not really for an ear length
IS_ET = regex.compile(r" e \.? t ", FLAGS)
IS_NUMBER = regex.compile(" [#] ", FLAGS)
IS_MAG = regex.compile(" magnemite ", FLAGS)
IS_ID = regex.compile(" identifier | ident | id ", FLAGS)

# The 'E' abbreviation sometimes gets confused with the abbreviation for East.
# Try to disambiguate the two by looking for a nearby North.
LOOK_AROUND = 10
IS_EAST = regex.compile(r" \b n ", FLAGS)
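
These constants are presumably consumed by a fix-up step like the one shown for
tail length in Example No. 29 below; a rough, hypothetical sketch of that pattern
(the real function is not included in this excerpt):

def fix_up(trait, text):
    """Reject parses that look like tags, IDs, or compass directions."""
    # Look just behind the match for evidence this is not an ear length.
    start = max(0, trait.start - LOOK_BACK_NEAR)
    if IS_ET.search(text, start, trait.start) or IS_NUMBER.search(text, start, trait.start):
        return None
    # An 'E' with an 'N' nearby usually means East/North coordinates, not an ear.
    start = max(0, trait.start - LOOK_AROUND)
    end = min(len(text), trait.end + LOOK_AROUND)
    if IS_EAST.search(text, start, end):
        return None
    return trait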
Example No. 9
"""Parse date notations."""

from calendar import IllegalMonthError
from datetime import date

import regex
from dateutil import parser
from dateutil.relativedelta import relativedelta
from traiter.old.vocabulary import LOWEST, Vocabulary

from digi_leap.parsers.base import Base
from digi_leap.pylib import patterns
from digi_leap.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)

YEAR_LEN = 2


def convert(token):
    """Normalize a parsed date."""
    trait = Trait(start=token.start, end=token.end)

    value = regex.sub(r'[^a-z\d]+',
                      '-',
                      token.group['value'],
                      flags=regex.I | regex.X)
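    # For example (illustrative): the substitution turns "12 May 1999" or
    # "12/05/1999" into "12-May-1999" and "12-05-1999", so later parsing only
    # has to deal with a single separator character.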

    if len(value) < 4:
        return None
Example No. 10
"""Patterns for US states."""

import pandas as pd
import regex
from traiter.old.vocabulary import Vocabulary

from digi_leap.pylib import const, patterns

STATE_CSV = const.DATA_DIR / 'US_states.csv'
STATES = {}
STATE_NAMES = []
NORMALIZE_US_STATE = {}

VOCAB = Vocabulary(patterns.VOCAB)

VOCAB.term(
    'USA', r"""
    U\.?S\.?A\.? | U\.?S\.?
    | United \s? States \s? of \s? America | United \s? States
    | U\.? \s? of \s? A\.?""")


def normalize_key(state: str) -> str:
    """Convert state abbreviations into a consistent key."""
    return regex.sub(r'[^a-z]+', '', state.lower())
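
# A quick illustration (not part of the original module): normalize_key("N. Mex.")
# and normalize_key("n mex") both collapse to "nmex", so punctuation and case
# variants of an abbreviation end up under one dictionary key.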


def normalize_state(state: str) -> str:
    """Convert state abbreviations to the state name."""
    return NORMALIZE_US_STATE.get(normalize_key(state), state.title())
Example No. 11
"""Parse hind foot length notations."""

from functools import partial

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple

VOCAB = Vocabulary(patterns.VOCAB)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Try to disambiguate double quotes from inches
    return fix_up_inches(trait, text)


HIND_FOOT_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: HindFootLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( hind \s* )? foot \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm )
            """,
        ),
        # Standard keywords that indicate a hind foot length follows
Example No. 12
"""Find collector notations on herbarium specimen labels."""

from itertools import zip_longest

import regex
from traiter.old.vocabulary import LOWEST, Vocabulary
from traiter.util import squash

from digi_leap.parsers import name_parts
from digi_leap.parsers.base import Base
from digi_leap.parsers.us_states import STATE_NAMES
from digi_leap.pylib.trait import Trait

VOCAB = Vocabulary(name_parts.VOCAB)

MIN_LEN = 5  # Minimum collector name length


def convert(token):
    """Build a collector trait"""
    names = regex.split(r'\s*(?:and|with|[,&])\s*',
                        token.group.get('col_name'))
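    # For example (illustrative): "J. Smith and R. Jones" splits into
    # ["J. Smith", "R. Jones"]; commas, "&", "and", and "with" all act as
    # separators between collector names here.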

    traits = []

    for name, suffix in zip_longest(names, names[1:], fillvalue=''):
        name = regex.sub(r'\.{3,}.*', '', name)
        if len(name) < MIN_LEN:
            continue

        trait = Trait(start=token.start, end=token.end)
Example No. 13
"""Parse total length notations."""

from functools import partial

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.numeric as numeric
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import compound, fix_up_inches, fraction
from vertnet.pylib.util import FLAGS

VOCAB = Vocabulary(patterns.VOCAB)

# How far to look into the surrounding context to disambiguate the parse
LOOK_BACK_FAR = 40
LOOK_BACK_NEAR = 10

# These indicate that the parse is not a total length
IS_ID = regex.compile(" identifier | ident | id | collector ", FLAGS)
IS_TRAP = regex.compile(" trap ", FLAGS)
IS_TESTES = regex.compile(
    " reproductive | gonad | test | scrotal | scrotum | scrot ", FLAGS)

# The 'L' abbreviation sometimes gets confused with the abbreviation for Left.
# Try to disambiguate the two by looking for a nearby Right.
LOOK_AROUND = 10
IS_LEFT = regex.compile(r" \b r \b ", FLAGS)

Example No. 14
"""Parse tragus length notations."""

from functools import partial

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple

VOCAB = Vocabulary(patterns.VOCAB)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Try to disambiguate double quotes from inches
    return fix_up_inches(trait, text)


TRAGUS_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: tragusLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( tragus \s* ) \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm ) """,
        ),
        # Standard keywords that indicate a tragus length follows
        VOCAB.term(
Example No. 15
"""Parse body mass notations."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list, squash, to_positive_float

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.convert_units import convert_units
from vertnet.pylib.numeric import add_flags, as_value, simple_mass
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def shorthand(token):
    """Convert a shorthand value like 11-22-33-44:55g."""
    trait = Trait(start=token.start, end=token.end)
    flag = as_value(token, trait, "shorthand_wt", "shorthand_wt_units")
    trait.is_flag_in_token(token, "estimated_wt", rename="estimated_value")
    trait.is_shorthand = True
    return trait if flag else None
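
# In VertNet-style shorthand such as "11-22-33-44:55g", the dash-separated numbers
# are conventionally total length, tail, hind foot, and ear, and the trailing
# ":55g" is the weight; as_value() above presumably pulls that weight and its
# units out of the shorthand_wt / shorthand_wt_units groups.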


def compound(token):
    """Convert a compound weight like: 2 lbs. 3.1 - 4.5 oz."""
    trait = Trait(start=token.start, end=token.end)
    trait.units = [token.group["pounds"], token.group["ounces"]]
    trait.units_inferred = False
    trait.is_flag_missing(token, "key", rename="ambiguous_key")
    lbs = convert_units(to_positive_float(token.group["lbs"]), "lbs")
    ozs = [
"""Parse lactation state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        VOCAB.part("not", r" \b ( not | non | no ) "),
        VOCAB.part(
            "post",
            r""" \b (
                (( just | recently ) \s+ )? finished
                | post | recently | recent | had | pre
            ) """,
        ),
"""Parse placental scar counts."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait
from vertnet.pylib.util import to_positive_int

VOCAB = Vocabulary(patterns.VOCAB)

SUB = {"l": "left", "r": "right", "m": "male", "f": "female"}


def convert_count(token):
    """Convert parsed tokens into a result."""
    trait = Trait(start=token.start, end=token.end)

    trait.value = to_positive_int(token.group.get("value"))
    count1 = to_positive_int(token.group.get("count1"))
    count2 = to_positive_int(token.group.get("count2"))
    side1 = SUB.get(token.group.get("side1", " ").lower()[0], "side1")
    side2 = SUB.get(token.group.get("side2", " ").lower()[0], "side2")
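    # SUB maps a leading "l"/"r" (or "m"/"f") to a full word; when the group is
    # missing, the " " default falls through to the literal "side1"/"side2",
    # which then serves as the attribute name in the setattr() call below.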

    if not trait.value:
        trait.value = count1 + count2

    if count1 or side1 != "side1":
        setattr(trait, side1, count1)
Example No. 18
"""Find taxon notations on herbarium specimen labels."""

import pandas as pd
from traiter.old.vocabulary import LOWEST, Vocabulary

from digi_leap.parsers.base import Base
from digi_leap.pylib import const, patterns
from digi_leap.pylib.trait import Trait

PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv'
PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv'

VOCAB = Vocabulary(patterns.VOCAB)
VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST)

DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str)
VOCAB.term('plant_family', DATA['complete_name'].tolist())

DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str)
VOCAB.term('plant_genus', DATA['complete_name'].tolist())


def convert(token):
    """Normalize a parsed taxon notation"""
    return Trait(start=token.start, end=token.end, value=token.group['value'])


PLANT_TAXON = Base(name='plant_taxon',
                   rules=[
                       VOCAB['eol'],
                       VOCAB.producer(convert,
Example No. 19
"""Shared token patterns."""

from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary

VOCAB = Vocabulary()

# Chars that may be a token
VOCAB.part('slash', r' [/] ', capture=False)
VOCAB.part('dash', r' (?: – | - ) ', capture=False)
VOCAB.part('open', r' [(\[] ', capture=False)
VOCAB.part('close', r' [)\]] ', capture=False)
VOCAB.part('x', r' [x×] ', capture=False)
VOCAB.part('quest', r' [?] ')
VOCAB.part('comma', r' [,] ', capture=False, priority=LOWEST)
VOCAB.part('semicolon', r' [;] ', capture=False, priority=LOWEST)
VOCAB.part('ampersand', r' [&] ', capture=False)
VOCAB.part('eq', r' [=] ', capture=False)
VOCAB.part('under', r' [_] ', capture=False)
VOCAB.part('eol', r' [\n\r\f] ', capture=False)
VOCAB.part('dot', r' [.] ', capture=False)

# Small words
VOCAB.part('by', r' by ', capture=False)
VOCAB.part('to', r' to ', capture=False)
VOCAB.part('with', r' with ', capture=False)
VOCAB.part('up_to', r' ( up \s+ )? to ', capture=False)
VOCAB.term('and', r' and ', capture=False)
VOCAB.term('conj', ' or and '.split(), capture=False)
VOCAB.term('prep', ' to with on of '.split(), capture=False)

VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)
Example No. 20
"""Parse embryo counts."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait
from vertnet.pylib.util import to_positive_int

VOCAB = Vocabulary(patterns.VOCAB)

SUB = {"l": "left", "r": "right", "m": "male", "f": "female"}


def convert(token):
    """Convert parsed tokens into a result."""
    trait = Trait(start=token.start, end=token.end)

    if token.group.get("total"):
        trait.value = to_positive_int(token.group["total"])

    if token.group.get("subcount"):
        trait.value = sum(
            to_positive_int(c) for c in as_list(token.group["subcount"]))

    if token.group.get("subcount") and token.group.get("sub"):
        for count, sub in zip(as_list(token.group["subcount"]),
                              as_list(token.group.get("sub"))):
            count = "1" if count == "!" else count
            sub = SUB.get(sub[0].lower(), sub)
Example No. 21
"""Parse lactation state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait."""
    trait = Trait(
        value="lactating" if token.group.get("pos") else "not lactating",
        start=token.start,
        end=token.end,
    )
    return trait


LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
Example No. 22
"""Patterns for names."""

import pandas as pd
from traiter.old.vocabulary import Vocabulary

from digi_leap.pylib import patterns
from digi_leap.pylib.const import DATA_DIR

NAME_CSV = DATA_DIR / 'name_parts.csv'

SUFFIXES = 'filho ii iii jr sr'.split()

VOCAB = Vocabulary(patterns.VOCAB)


def build_name_parts():
    """Build name patterns."""
    df = pd.read_csv(NAME_CSV, na_filter=False, dtype=str)
    VOCAB.term('name_part', df['name'].tolist(), capture=False)


build_name_parts()

VOCAB.term('suffix', SUFFIXES)
VOCAB.term('initial', r'[[:alpha:]] (?! \s* \d+ )')
Example No. 23
"""Patterns for US counties."""

from collections import defaultdict

import pandas as pd
from traiter.old.vocabulary import Vocabulary

from digi_leap.parsers import us_states
from digi_leap.pylib import const

COUNTY_CSV = const.DATA_DIR / 'US_counties.csv'

VOCAB = Vocabulary(us_states.VOCAB)


def build_counties():
    """Read the CSV file and build counties by state."""
    counties = defaultdict(list)
    df = pd.read_csv(COUNTY_CSV, na_filter=False, dtype=str)
    for _, row in df.iterrows():
        counties[row['State']].append(row['County'])

    us_county = []
    for abbrev in us_states.STATES.values():
        names = []
        for name in [n for n in counties[abbrev] if n not in us_states.STATES]:
            name = name.replace('.', r'\.?')
            name = name.replace("'", "'?")
            name = name.replace(' ', r'\s?')
            name = name.replace('-', r'[\s-]?')
            names.append(name)
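            # A worked example (illustrative only): "St. Mary's" becomes the
            # pattern r"St\.?\s?Mary'?s", so the period, space, and apostrophe
            # may each be missing from a label and the county name still matches.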
Example No. 24
"""Parse embryo lengths."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list, to_positive_float

import vertnet.pylib.convert_units as convert_units
import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import add_flags, fix_up_inches, simple
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)

TOO_BIG = 1000


def convert(token):
    """Convert parsed token into a trait product."""
    trait = simple(token, units="len_units")
    return trait if all(x < TOO_BIG for x in as_list(trait.value)) else None


def isolate(token):
    """Convert parsed token into a trait product."""
    token.group["number"] = [
        v.strip() for v in token.group["value"].split("x")
    ]
    return convert(token)
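
# isolate() handles cross-style values such as "12 x 7": splitting on "x" and
# stripping whitespace yields ["12", "7"], which are handed back to convert()
# as separate numbers.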


def convert_many(token):
Example No. 25
"""Shared reproductive trait tokens (testes & ovaries)."""

from traiter.old.vocabulary import LOWEST, Vocabulary

import vertnet.pylib.patterns as patterns

VOCAB = Vocabulary(patterns.VOCAB)

VOCAB.term("sex", "females? | males? | [f]")

VOCAB.term("active", "active inactive".split())
VOCAB.part("and", r" ( and \b | [&] ) ")
VOCAB.term("count", r"""( only | all | both )? \s* [12]""")

VOCAB.term(
    "color",
    r""" (( dark | light | pale ) \s* )?
         ( red | pink | brown | black | white | pigmented )
    """,
)

VOCAB.term("texture", " smooth ")

VOCAB.term("covered", " covered ")

VOCAB.term("destroyed", "destroy(ed)?")

VOCAB.part(
    "size",
    r"""
        ( very \s+ )?
Example No. 26
"""Parse forearm length notations."""

from functools import partial

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple

VOCAB = Vocabulary(patterns.VOCAB)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Try to disambiguate double quotes from inches
    return fix_up_inches(trait, text)


FOREARM_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: ForearmLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( forearm \s* )? \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm )
            """,
        ),
        # Standard keywords that indicate a forearm length follows
Example No. 27
"""Shared token patterns."""

from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary

from vertnet.pylib.util import NUM_WORDS, ORDINALS

VOCAB = Vocabulary()

# Chars that may be a token
VOCAB.part("slash", r" [/] ", capture=False)
VOCAB.part("dash", r" \p{Pd} ", capture=False)
VOCAB.part("open", r" \p{Ps} ", capture=False)
VOCAB.part("close", r" \p{Pe} ", capture=False)
VOCAB.part("x", r" [x×] ", capture=False)
VOCAB.part("quest", r" [?] ")
VOCAB.part("comma", r" [,] ", capture=False, priority=LOWEST)
VOCAB.part("semicolon", r" [;] ", capture=False, priority=LOWEST)
VOCAB.part("colon", r" [:] ", capture=False, priority=LOWEST)
VOCAB.part("ampersand", r" [&] ", capture=False)
VOCAB.part("eq", r" [=] ", capture=False)
VOCAB.part("plus", r" [+] ", capture=False)
VOCAB.part("under", r" [_] ", capture=False)
VOCAB.part("eol", r" [\n\r\f] ", capture=False)
VOCAB.part("dot", r" [.] ", capture=False)

# Small words
VOCAB.part("by", r" by ", capture=False)
VOCAB.part("to", r" to ", capture=False)
VOCAB.part("with", r" with ", capture=False)
VOCAB.part("up_to", r" ( up \s+ )? to ", capture=False)
VOCAB.term("and", r" and ", capture=False)
Example No. 28
"""Parse lactation state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait
from vertnet.pylib.util import to_positive_int

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert single value tokens into a result."""
    value = token.group.get("value")

    if not value:
        return None

    trait = Trait(start=token.start, end=token.end)
    trait.value = to_positive_int(value)

    if trait.value > 100:
        return None

    if token.group.get("notation"):
        trait.notation = token.group["notation"]

    return trait
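
# A small usage note (illustrative): a token whose "value" group is "3" yields a
# trait with value 3, while anything over 100 (e.g. "250") is treated as
# implausible and convert() returns None.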

Example No. 29
"""Parse tail length notations."""

from functools import partial

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple
from vertnet.pylib.util import FLAGS

VOCAB = Vocabulary(patterns.VOCAB)

# How far to look into the surrounding context to disambiguate the parse
LOOK_BACK_FAR = 40
LOOK_BACK_NEAR = 20

# These indicate that the parse is not really for a tail length
IS_TESTES = regex.compile(
    " reproductive | gonad | test | scrotal | scrotum | scrot ", FLAGS)
IS_ELEVATION = regex.compile(" elevation | elev ", FLAGS)
IS_TOTAL = regex.compile(" body | nose | snout ", FLAGS)
IS_TAG = regex.compile(" tag ", FLAGS)
IS_ID = regex.compile(" identifier | ident | id ", FLAGS)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Check that this isn't a total length trait
    start = max(0, trait.start - LOOK_BACK_NEAR)
Example No. 30
"""Parse testes state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait producer."""
    trait = Trait(
        value="scrotal" if token.group.get("pos") else "not scrotal",
        start=token.start,
        end=token.end,
    )
    return trait


SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev_pos", "sc".split()),
        VOCAB.term("scrotal_abbrev_neg", "ns ".split()),

        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        VOCAB.producer(convert, """ (?P<pos> scrotal_pos ) """),