Пример #1
0
def mentions():
    """Populate the `ignimbrite_mention` table.

    For every sentence whose lemmas overlap `ignimbrite_terms`, each
    matching word is stored together with the adjectives, adverbs and
    verbs that reference it (as reported by `Sentence.words_referencing`).
    Progress is echoed to stdout and the session is committed once per
    sentence.
    """
    # The lemma filter is expressed as a query filter so that the database
    # does the work; pushing logic into SQL is much quicker than looping
    # over every sentence in Python.
    run_query('create_mention_table')
    mention_table = reflect_table('ignimbrite_mention')

    matching_rows = session.query(nlp).filter(
        nlp.c.lemmas.overlap(ignimbrite_terms))

    for record in matching_rows:
        sentence = Sentence(record)
        print(sentence.document, sentence.id)

        for word in sentence:
            if word.lemma not in ignimbrite_terms:
                continue

            # Keep only the referencing words that describe the term.
            refs = []
            for candidate in sentence.words_referencing(word):
                if (candidate.is_adjective or candidate.is_adverb
                        or candidate.is_verb):
                    refs.append(candidate)

            print(word, " ".join(str(ref) for ref in refs))
            print()

            session.execute(
                insert(mention_table).values(
                    docid=sentence.document,
                    sentid=sentence.id,
                    wordidx=word.index,
                    word=str(word),
                    refs=[str(ref) for ref in refs],
                    ref_poses=[ref.pose for ref in refs]))

        # One commit per sentence.
        session.commit()
Пример #2
0
def ages():
    """Populate the `ignimbrite_age` table with ages parsed from sentences.

    Sentences whose lemmas overlap both `age_terms` and `ignimbrite_terms`
    are scanned for numeric ages such as ``12.3 Ma``, ``12.3 ± 0.4 Ma`` or
    ``10 to 12 ka``. One row is inserted per match; the session is
    committed once per sentence.
    """
    run_query('create_ages_table')
    table = reflect_table('ignimbrite_age')

    res = (session.query(nlp)
           .filter(nlp.c.lemmas.overlap(age_terms))
           .filter(nlp.c.lemmas.overlap(ignimbrite_terms)))

    # Raw string: the pattern is full of regex escapes that are not valid
    # Python string escapes. Groups are: age, error, separator, end age, unit.
    age_range = re.compile(
        r"(\d+(?:\.\d+)?)(?: ± (\d+(?:\.\d+)?))?(?: ?(-*|to|and) ?(\d+(?:\.\d+)?))? ?([Mk]a)"
    )

    def fix_age(val, unit):
        """Convert a captured number to float, dividing 'ka' values by 1000.

        Groups that did not take part in the match come back from
        `findall` as empty strings; those (and None) map to None.
        """
        if val is None or val == '':
            return None
        val = float(val)
        if unit == 'ka':
            # Presumably normalises everything to Ma -- confirm with schema.
            val /= 1000
        return val

    for row in res:
        sentence = Sentence(row)
        for age, error, _sep, end_age, unit in age_range.findall(str(sentence)):
            stmt = insert(table).values(docid=sentence.document,
                                        sentid=sentence.id,
                                        age=fix_age(age, unit),
                                        error=fix_age(error, unit),
                                        end_age=fix_age(end_age, unit))
            session.execute(stmt)
        session.commit()
def named_locations():
    """Populate the `ignimbrite_named_location` table.

    Words tagged with the NER label ``LOCATION`` are grouped into phrases:
    adjacent tagged words are joined, and two tagged words separated by
    exactly one untagged word are joined together with that word. Each
    phrase is inserted with its document and sentence ids; a single commit
    is issued after all sentences have been processed.
    """
    run_query('create_named_locations_table')
    location_table = reflect_table('ignimbrite_named_location')

    tagged_rows = session.query(nlp).filter(
        nlp.c.ners.overlap(['LOCATION']))

    for record in tagged_rows:
        sentence = Sentence(record)
        location_indices = [
            ix for ix, token in enumerate(sentence)
            if token.ner == 'LOCATION'
        ]

        phrases = []
        previous = None
        for current in location_indices:
            # Indices are strictly increasing, so the distance to the
            # previously seen index tells us whether the neighbour (or the
            # word before it) was also a location.
            gap = None if previous is None else current - previous
            if gap == 1:
                phrases[-1] += f" {sentence[current]}"
            elif gap == 2:
                phrases[-1] += f" {sentence[current-1]} {sentence[current]}"
            else:
                phrases.append(str(sentence[current]))
            previous = current

        for phrase in phrases:
            session.execute(
                insert(location_table).values(
                    phrase=phrase,
                    docid=sentence.document,
                    sentid=sentence.id))

    session.commit()
Пример #4
0
def ages():
    """Populate the `global_geology_age` table with ages parsed from sentences.

    Sentences whose lemmas overlap `age_terms` are scanned for numeric ages
    such as ``12.3 Ma``, ``12.3 ± 0.4 Ma`` or ``10 to 12 ka``. Each captured
    number is passed through `fix_age` together with its unit before being
    stored. One row is inserted per match; the session is committed once
    per sentence.
    """
    run_query('create_ages_table')
    table = reflect_table('global_geology_age')

    res = session.query(nlp).filter(nlp.c.lemmas.overlap(age_terms))

    # Raw string: the pattern is full of regex escapes that are not valid
    # Python string escapes. Groups are: age, error, separator, end age, unit.
    age_range = re.compile(
        r"(\d+(?:\.\d+)?)(?: ± (\d+(?:\.\d+)?))?(?: ?(-*|to|and) ?(\d+(?:\.\d+)?))? ?([Mk]a)"
    )
    for row in res:
        sentence = Sentence(row)
        for age, error, _sep, end_age, unit in age_range.findall(str(sentence)):
            stmt = insert(table).values(docid=sentence.document,
                                        sentid=sentence.id,
                                        age=fix_age(age, unit),
                                        error=fix_age(error, unit),
                                        end_age=fix_age(end_age, unit))
            session.execute(stmt)
        session.commit()
Пример #5
0
def units():
    """Writes a table containing geologic unit mentions.

    For each word whose lemma is in `unit_types`, the function walks
    backwards over the preceding proper nouns to assemble a multi-word unit
    name. A candidate needs at least one preceding proper noun to be kept.
    Each accepted name is printed and inserted into `global_geology_unit`
    along with any position qualifier or period word that ended the walk.
    The session is committed once per sentence.
    """

    # Instead of creating the table in raw SQL and then reflecting it,
    # we could define its schema directly in the SQLAlchemy ORM.
    run_query('create_unit_table')
    table = reflect_table('global_geology_unit')

    # Query the nlp output (sentences table, see 'database.py') for all rows
    # that have one of the `unit_types` in their lemmas column.
    res = session.query(nlp).filter(nlp.c.lemmas.overlap(unit_types))

    # Get unit periods from macrostrat
    periods = [r[0] for r in run_query('get_periods')]

    for row in res:
        sentence = Sentence(row)
        # Each `unit_type` word must be preceded by at least one proper name
        for word in sentence:
            if word.lemma not in unit_types:
                continue
            # Words of the unit name, collected back to front.
            # Set to None to reject the candidate altogether.
            name_words = [word]
            prev = word.previous()

            # Upper, middle, lower, etc.
            position = None
            # Period
            period = None

            while prev is not None:
                if not prev.is_proper_noun:
                    break
                # Should institute a check for geologic unit map ids e.g.
                # `Tsvl`, `Qal` as these seem to be categorized as proper nouns.
                if prev.lemma in terms('Working', 'Research', 'Data'):
                    # Often `Groups` are actually functional groups of people!
                    name_words = None
                    break
                if prev.lemma in terms('Upper', 'Middle', 'Lower'):
                    # Record upper/middle/lower separately from the name
                    position = str(prev)
                    break
                if any(p in prev.lemma for p in periods):
                    # The unit is preceded by a word containing an
                    # identified geological period
                    period = str(prev)
                    break
                if prev.lemma in unit_types:
                    # We are stepping on a previously identified adjacent unit
                    break
                if prev.lemma in age_terms:
                    break

                # Build an array back to front catching multiword units
                name_words.append(prev)
                prev = prev.previous()

            if name_words is None or len(name_words) < 2:
                continue
            name_words.reverse()

            name = " ".join(str(i) for i in name_words)
            print(name)
            stmt = insert(table).values(name=name,
                                        short_name=" ".join(
                                            str(i) for i in name_words[:-1]),
                                        position=position,
                                        period=period,
                                        docid=sentence.document,
                                        sentid=sentence.id)
            session.execute(stmt)
        session.commit()
def locations():
    """Get locations in degrees and store them in `ignimbrite_location`.

    Each sentence returned by the `get_location_sentences` query is scanned
    for coordinates written in decimal degrees or degrees/minutes/seconds
    followed by a cardinal direction. Sentences need at least one latitude
    and one longitude, each spanning no more than 5 degrees; the averaged
    longitude/latitude is stored as a point with SRID 4326. A single commit
    is issued after all sentences have been processed.
    """
    run_query('create_locations_table')
    table = reflect_table('ignimbrite_location')

    # We want to employ more complex logic here,
    # so we define the query directly in SQL
    res = run_query('get_location_sentences')

    # Regex to parse common DMS and DD location coordinates.
    # Raw strings keep the regex escapes out of Python's string parser.
    expr = re.compile(r" ((\d+(?:\.\d+)?)°([\d '`\"]+)([NSEW]))")

    # Regex to parse possible minute-second pairs to numbers
    expr2 = re.compile(r"[\d\.]+")

    def dms2dd(degrees, minutes=0, seconds=0):
        """Convert degrees/minutes/seconds to decimal degrees."""
        return degrees + minutes/60 + seconds/3600

    for row in res:
        lats = []
        lons = []
        sentence = Sentence(row)
        text = str(sentence)
        matches = expr.findall(text)
        if len(matches) < 2:
            # We need at least two to have a hope of
            # finding a lat-lon pair
            continue
        for _match, deg, minute_second, cardinal_direction in matches:
            deg = float(deg)
            if not minute_second.isspace():
                # First number is minutes, second is seconds; a missing
                # component counts as zero.
                parts = expr2.findall(minute_second)
                minutes = float(parts[0]) if len(parts) > 0 else 0
                seconds = float(parts[1]) if len(parts) > 1 else 0
                deg = dms2dd(deg, minutes=minutes, seconds=seconds)
            if cardinal_direction in ('S', 'W'):
                # Southern and western hemispheres are negative
                deg *= -1
            if cardinal_direction in ('N', 'S'):
                lats.append(deg)
            else:
                lons.append(deg)
        if not lons or not lats:
            continue
        # Get rid of sentences where there is too
        # wide a spread of lon/lat values (probably
        # signifying some sort of map labels).
        if max(lons)-min(lons) > 5:
            continue
        if max(lats)-min(lats) > 5:
            continue

        # We average for now
        # ...more interesting would be to create
        # and record bounding boxes
        lon = sum(lons)/len(lons)
        lat = sum(lats)/len(lats)

        print(sentence)
        print(lon, lat)
        print("")

        point = from_shape(Point(lon, lat), srid=4326)

        stmt = insert(table).values(
            geometry=point,
            docid=sentence.document,
            sentid=sentence.id)
        session.execute(stmt)
    session.commit()
Пример #7
0
def locations():
    """Get locations in degrees and store them in `global_geology_location`.

    Each sentence returned by the `get_location_sentences` query is scanned
    for coordinates written in decimal degrees or degrees/minutes/seconds
    followed by a cardinal direction. Sentences need at least one latitude
    and one longitude, each spanning no more than 5 degrees; the averaged
    longitude/latitude is stored as a point with SRID 4326 together with
    the sentence text. A single commit is issued after all sentences have
    been processed.
    """
    run_query('create_locations_table')
    table = reflect_table('global_geology_location')

    # We want to employ more complex logic here,
    # so we define the query directly in SQL
    res = run_query('get_location_sentences')

    # Regex to parse common DMS and DD location coordinates.
    # The trailing boundary is a lookahead rather than a consumed character:
    # otherwise the separator after one coordinate is swallowed and a second
    # coordinate following a single space ("40°N 79°W") can never match its
    # required leading whitespace.
    expr = re.compile(
        r"[\s]((\d+(?:\.\d+)?)[°◦]([\d\s′'`\"]*)([NSEW]))(?=\W)")

    # Regex to parse possible minute-second pairs to numbers
    expr2 = re.compile(r"[\d\.]+")

    for row in res:
        lats = []
        lons = []
        sentence = Sentence(row)
        # Pad sentence with spaces so our regex will match coordinates
        # at the beginning and end of a line.
        text = f" {sentence} "
        matches = expr.findall(text)
        if len(matches) < 2:
            # We need at least two matches to have any hope of
            # finding an X-Y coordinate pair
            continue
        for _match, deg, minute_second, cardinal_direction in matches:
            deg = float(deg)
            if not minute_second.isspace():
                # First number is minutes, second is seconds; a missing
                # component counts as zero.
                parts = expr2.findall(minute_second)
                minutes = float(parts[0]) if len(parts) > 0 else 0
                seconds = float(parts[1]) if len(parts) > 1 else 0
                deg = dms2dd(deg, minutes=minutes, seconds=seconds)
            if cardinal_direction in ('S', 'W'):
                # Southern and western hemispheres are negative
                deg *= -1
            if cardinal_direction in ('N', 'S'):
                lats.append(deg)
            else:
                lons.append(deg)
        if not lons or not lats:
            continue
        # Get rid of sentences where there is too
        # wide a spread of lon/lat values (probably
        # signifying some sort of map labels).
        if max(lons) - min(lons) > 5:
            continue
        if max(lats) - min(lats) > 5:
            continue

        # We average for now
        # ...more interesting would be to create
        # and record bounding boxes
        lon = sum(lons) / len(lons)
        lat = sum(lats) / len(lats)

        print(sentence)
        secho(f"{len(lons)} longitudes and {len(lats)} latitudes found",
              fg='green')
        secho(f"{lon} {lat}", fg='green')
        print("")

        point = from_shape(Point(lon, lat), srid=4326)

        stmt = insert(table).values(geometry=point,
                                    docid=sentence.document,
                                    sentid=sentence.id,
                                    sentence=str(sentence))
        session.execute(stmt)
    session.commit()