def mentions():
    """Writes a table indexing sentences mentioning ignimbrites"""
    # Lemma filtering is pushed down into the PostgreSQL engine rather than
    # done in Python — far quicker. In general, any logic that can live in
    # SQL should live in SQL.
    run_query('create_mention_table')
    table = reflect_table('ignimbrite_mention')
    rows = session.query(nlp).filter(nlp.c.lemmas.overlap(ignimbrite_terms))
    for record in rows:
        sent = Sentence(record)
        print(sent.document, sent.id)
        for token in sent:
            # Only index the tokens whose lemma is an ignimbrite term
            if token.lemma not in ignimbrite_terms:
                continue
            # Words grammatically referencing this token, restricted to
            # descriptive parts of speech
            referring = [
                w for w in sent.words_referencing(token)
                if w.is_adjective or w.is_adverb or w.is_verb
            ]
            print(token, " ".join(str(r) for r in referring))
            print()
            stmt = insert(table).values(
                docid=sent.document,
                sentid=sent.id,
                wordidx=token.index,
                word=str(token),
                refs=[str(w) for w in referring],
                ref_poses=[w.pose for w in referring])
            session.execute(stmt)
    session.commit()
def ages():
    """Writes a table of age mentions in sentences that also mention
    ignimbrites."""
    run_query('create_ages_table')
    table = reflect_table('ignimbrite_age')
    # Sentences must contain both an age term and an ignimbrite term
    res = (session.query(nlp)
           .filter(nlp.c.lemmas.overlap(age_terms))
           .filter(nlp.c.lemmas.overlap(ignimbrite_terms)))
    # Matches e.g. "12.3 ± 0.4 Ma", "1.2 - 1.5 Ma", "700 ka".
    # Raw string avoids invalid-escape SyntaxWarnings for \d etc.
    age_range = re.compile(
        r"(\d+(?:\.\d+)?)(?: ± (\d+(?:\.\d+)?))?"
        r"(?: ?(-*|to|and) ?(\d+(?:\.\d+)?))? ?([Mk]a)")

    # Hoisted out of the match loop (was re-defined on every iteration);
    # `unit` is now an explicit parameter instead of a loop-variable closure.
    def fix_age(val, unit):
        """Convert a captured group to float Ma; '' becomes None, ka -> Ma."""
        if val == '':
            return None
        val = float(val)
        if unit == 'ka':
            val /= 1000
        return val

    for row in res:
        sentence = Sentence(row)
        for (age, error, sep, end_age, unit) in age_range.findall(str(sentence)):
            stmt = insert(table).values(
                docid=sentence.document,
                sentid=sentence.id,
                age=fix_age(age, unit),
                error=fix_age(error, unit),
                end_age=fix_age(end_age, unit))
            session.execute(stmt)
    session.commit()
def named_locations():
    """Records phrases that the NER pipeline tagged as LOCATION."""
    run_query('create_named_locations_table')
    table = reflect_table('ignimbrite_named_location')
    res = session.query(nlp).filter(nlp.c.ners.overlap(['LOCATION']))
    for row in res:
        sentence = Sentence(row)
        # Positions of every LOCATION-tagged token in this sentence
        location_indexes = [
            ix for ix, tok in enumerate(sentence) if tok.ner == 'LOCATION'
        ]
        phrases = []
        for ix in location_indexes:
            if ix - 1 in location_indexes:
                # Directly adjacent to the previous location word:
                # extend the current phrase
                phrases[-1] += f" {sentence[ix]}"
            elif ix - 2 in location_indexes:
                # One-word gap between location words: bridge it,
                # keeping the intervening word in the phrase
                phrases[-1] += f" {sentence[ix-1]} {sentence[ix]}"
            else:
                # Start a new phrase
                phrases.append(str(sentence[ix]))
        for phrase in phrases:
            stmt = insert(table).values(
                phrase=phrase,
                docid=sentence.document,
                sentid=sentence.id)
            session.execute(stmt)
    session.commit()
def ages():
    """Writes a table of age mentions found in age-term sentences."""
    run_query('create_ages_table')
    table = reflect_table('global_geology_age')
    res = session.query(nlp).filter(nlp.c.lemmas.overlap(age_terms))
    # Matches e.g. "12.3 ± 0.4 Ma", "1.2 - 1.5 Ma", "700 ka".
    # Raw string avoids invalid-escape SyntaxWarnings for \d etc.
    age_range = re.compile(
        r"(\d+(?:\.\d+)?)(?: ± (\d+(?:\.\d+)?))?"
        r"(?: ?(-*|to|and) ?(\d+(?:\.\d+)?))? ?([Mk]a)")
    for row in res:
        sentence = Sentence(row)
        for match in age_range.findall(str(sentence)):
            (age, error, sep, end_age, unit) = match
            # fix_age normalizes captured strings to float Ma (module-level
            # helper; '' maps to None — presumably, verify against definition)
            stmt = insert(table).values(
                docid=sentence.document,
                sentid=sentence.id,
                age=fix_age(age, unit),
                error=fix_age(error, unit),
                end_age=fix_age(end_age, unit))
            session.execute(stmt)
    session.commit()
def units():
    """Writes a table containing geologic unit mentions"""
    # Instead of creating the table in raw SQL and then reflecting,
    # we could define its schema directly in the SQLAlchemy ORM.
    run_query('create_unit_table')
    table = reflect_table('global_geology_unit')
    # Query the NLP output (sentences table, see 'database.py'): keep rows
    # whose lemmas contain one of the known unit types.
    res = session.query(nlp).filter(nlp.c.lemmas.overlap(unit_types))
    # Get unit periods from Macrostrat
    periods = [r[0] for r in run_query('get_periods')]
    for row in res:
        sentence = Sentence(row)
        # Walk backwards from each `unit_type` token: each must be preceded
        # by at least one proper name to count as a unit mention.
        for word in sentence:
            if word.lemma not in unit_types:
                continue
            # Words of the unit name, gathered back-to-front;
            # set to None to discard this candidate entirely.
            unit_words = [word]
            prev = word.previous()
            position = None  # Upper, Middle, Lower, etc.
            period = None    # Geologic period prefixing the name, if any
            while prev is not None:
                if not prev.is_proper_noun:
                    break
                # Should institute a check for geologic-unit map ids, e.g.
                # `Tsvl`, `Qal`, as these seem to be categorized as proper nouns.
                if prev.lemma in terms('Working', 'Research', 'Data'):
                    # Often `Groups` are actually functional groups of people!
                    unit_words = None
                    break
                if prev.lemma in terms('Upper', 'Middle', 'Lower'):
                    # Filter out Upper/Middle/Lower as stratigraphic position.
                    # NOTE(review): breaking here stops name collection, so a
                    # candidate with only a position prefix is discarded by the
                    # len < 2 check below — confirm this is intended.
                    position = str(prev)
                    break
                if any(p in prev.lemma for p in periods):
                    # The unit is preceded by a word containing an identified
                    # geological period
                    period = str(prev)
                    break
                if prev.lemma in unit_types:
                    # We are stepping on a previously identified adjacent unit
                    break
                if prev.lemma in age_terms:
                    break
                # Build the array back-to-front, catching multiword units
                unit_words.append(prev)
                prev = prev.previous()
            # Require at least one proper-noun name word before the unit type
            if unit_words is None or len(unit_words) < 2:
                continue
            unit_words.reverse()
            name = " ".join(str(i) for i in unit_words)
            print(name)
            stmt = insert(table).values(
                name=name,
                # Name without the trailing unit-type word
                short_name=" ".join(str(i) for i in unit_words[:-1]),
                position=position,
                period=period,
                docid=sentence.document,
                sentid=sentence.id)
            session.execute(stmt)
    session.commit()
def locations():
    """Get locations in degrees and write them as PostGIS points."""
    run_query('create_locations_table')
    table = reflect_table('ignimbrite_location')
    # We want to employ more complex logic here,
    # so we define the query directly in SQL
    res = run_query('get_location_sentences')
    # Regex to parse common DMS and DD location coordinates.
    # Raw strings avoid invalid-escape SyntaxWarnings for \d etc.
    expr = re.compile(r" ((\d+(?:\.\d+)?)°([\d '`\"]+)([NSEW]))")
    # Regex to parse possible minute-second pairs to numbers
    expr2 = re.compile(r"[\d\.]+")

    def dms2dd(degrees, minutes=0, seconds=0):
        """Convert degrees/minutes/seconds to decimal degrees."""
        return degrees + minutes / 60 + seconds / 3600

    # Hoisted out of the match loop (was re-defined on every iteration)
    def _number_at(parts, ix):
        """Return float(parts[ix]), or 0 when that group is absent."""
        try:
            return float(parts[ix])
        except IndexError:
            return 0

    for row in res:
        lats = []
        lons = []
        sentence = Sentence(row)
        text = str(sentence)
        matches = expr.findall(text)
        if len(matches) < 2:
            # We need at least two matches to have a hope of
            # finding a lat-lon pair
            continue
        for match, deg, minute_second, cardinal_direction in matches:
            deg = float(deg)
            if not minute_second.isspace():
                ms = expr2.findall(minute_second)
                deg = dms2dd(deg,
                             minutes=_number_at(ms, 0),
                             seconds=_number_at(ms, 1))
            # South and West are negative in decimal degrees
            if cardinal_direction in ['S', 'W']:
                deg *= -1
            if cardinal_direction in ('N', 'S'):
                lats.append(deg)
            else:
                lons.append(deg)
        # Skip unless we found at least one of each
        if not len(lons) * len(lats):
            continue
        # Get rid of sentences where there is too wide a spread of
        # lon/lat values (probably signifying some sort of map labels).
        if max(lons) - min(lons) > 5:
            continue
        if max(lats) - min(lats) > 5:
            continue
        # We average for now
        # ...more interesting would be to create
        # and record bounding boxes
        mean = lambda x: sum(x) / len(x)
        lon = mean(lons)
        lat = mean(lats)
        print(sentence)
        print(lon, lat)
        print("")
        point = from_shape(Point(lon, lat), srid=4326)
        stmt = insert(table).values(
            geometry=point,
            docid=sentence.document,
            sentid=sentence.id)
        session.execute(stmt)
    session.commit()
def locations():
    """Get locations in degrees and write them as PostGIS points."""
    run_query('create_locations_table')
    table = reflect_table('global_geology_location')
    # We want to employ more complex logic here,
    # so we define the query directly in SQL
    res = run_query('get_location_sentences')
    # Regex to parse common DMS and DD location coordinates.
    # Raw strings avoid invalid-escape SyntaxWarnings for \d etc.
    expr = re.compile(r"[\s]((\d+(?:\.\d+)?)[°◦]([\d\s′'`\"]*)([NSEW]))\W")
    # Regex to parse possible minute-second pairs to numbers
    expr2 = re.compile(r"[\d\.]+")

    # Hoisted out of the match loop (was re-defined on every iteration)
    def _number_at(parts, ix):
        """Return float(parts[ix]), or 0 when that group is absent."""
        try:
            return float(parts[ix])
        except IndexError:
            return 0

    for row in res:
        lats = []
        lons = []
        sentence = Sentence(row)
        # Pad sentence with spaces so our regex will match coordinates
        # at the beginning and end of a line.
        text = f" {sentence} "
        matches = expr.findall(text)
        if len(matches) < 2:
            # We need at least two matches to have any hope of
            # finding an X-Y coordinate pair
            continue
        for match, deg, minute_second, cardinal_direction in matches:
            deg = float(deg)
            if not minute_second.isspace():
                ms = expr2.findall(minute_second)
                # dms2dd is a module-level helper converting DMS to
                # decimal degrees
                deg = dms2dd(deg,
                             minutes=_number_at(ms, 0),
                             seconds=_number_at(ms, 1))
            # South and West are negative in decimal degrees
            if cardinal_direction in ['S', 'W']:
                deg *= -1
            if cardinal_direction in ('N', 'S'):
                lats.append(deg)
            else:
                lons.append(deg)
        # Skip unless we found at least one of each
        if not len(lons) * len(lats):
            continue
        # Get rid of sentences where there is too wide a spread of
        # lon/lat values (probably signifying some sort of map labels).
        if max(lons) - min(lons) > 5:
            continue
        if max(lats) - min(lats) > 5:
            continue
        # We average for now
        # ...more interesting would be to create
        # and record bounding boxes
        mean = lambda x: sum(x) / len(x)
        lon = mean(lons)
        lat = mean(lats)
        print(sentence)
        secho(f"{len(lons)} longitudes and {len(lats)} latitudes found",
              fg='green')
        secho(f"{lon} {lat}", fg='green')
        print("")
        point = from_shape(Point(lon, lat), srid=4326)
        stmt = insert(table).values(
            geometry=point,
            docid=sentence.document,
            sentid=sentence.id,
            sentence=str(sentence))
        session.execute(stmt)
    session.commit()