def hasNthFromStart(tpentity, ref_list):
    """Find an ordinal word in a TimePhrase that sits next to a temporal token.

    Args:
        tpentity: TimePhrase entity; ``getSpan()`` and ``getText()`` are used.
        ref_list: Reference tokens; ``isTemporal()`` is called on the
            neighbours of the ordinal's token.

    Returns:
        ``(True, val, start_idx, end_idx)`` for the first token that
        ``utils.isOrdinal`` recognises and that has a temporal neighbour.
        ``val`` is whatever ``utils.isOrdinal`` returned and the indices are
        offsets into the phrase text.  Otherwise
        ``(False, None, None, None)``.
    """
    refStart_span, _ = tpentity.getSpan()

    # Lower-case, then blank out punctuation one-for-one so character offsets
    # in text_norm still line up with the original phrase text.
    text = tpentity.getText().lower()
    text_norm = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    last_idx = len(ref_list) - 1

    # Check every word of the phrase, not just the phrase as a whole.
    for t in text_norm.split(" "):
        val = utils.isOrdinal(t)
        if val is None:
            continue

        start_idx, end_idx = Chrono.utils.calculateSpan(text_norm, t)

        # Locate the reference token for this word so its neighbours can be
        # inspected.
        idx = utils.getRefIdx(ref_list, refStart_span + start_idx,
                              refStart_span + end_idx)
        # A negative index cannot be a real token position.
        # NOTE(review): presumably getRefIdx signals "not found" this way -
        # confirm against utils.
        if idx < 0:
            continue

        # Only look at neighbours that exist.  Without the bounds check,
        # idx == 0 silently wrapped around to the last token of the document
        # and the final token raised IndexError on idx + 1.
        neighbours = [n for n in (idx - 1, idx + 1) if 0 <= n <= last_idx]
        if any(ref_list[n].isTemporal() for n in neighbours):
            return True, val, start_idx, end_idx

    return False, None, None, None


####
# END_MODULE
####
def hasSeasonOfYear(tpentity, ref_list):
    """Detect a single season-of-year word in a TimePhrase.

    Args:
        tpentity: TimePhrase entity; ``getSpan()`` and ``getText()`` are used.
        ref_list: Reference tokens; ``getPos()`` is read from the token that
            ``utils.getRefIdx`` locates for the season word.

    Returns:
        ``(True, season, start_idx, end_idx)`` where ``season`` is one of
        "Summer", "Winter", "Fall" or "Spring" and the indices are offsets of
        the singular season word inside the phrase text.  Returns
        ``(False, None, None, None)`` when the phrase holds zero or several
        distinct season words, or when the token's POS tag is not "NN".

    NOTE(review): plural forms are accepted as tokens but the POS check still
    requires exactly "NN"; if the tagger labels plurals differently they can
    never match - confirm intent.
    """
    refStart_span, _ = tpentity.getSpan()

    # Blank out punctuation one-for-one.  The text is deliberately NOT
    # stripped: stripping leading blanks would shift every offset computed
    # below relative to the phrase's real start.
    text = tpentity.getText().lower()
    text_norm = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    labels = {
        "summer": "Summer",
        "winter": "Winter",
        "fall": "Fall",
        "spring": "Spring",
    }
    # Map every accepted token (singular and plural) to its singular stem.
    stem_of = {}
    for stem in labels:
        stem_of[stem] = stem
        stem_of[stem + "s"] = stem

    # Which tokens of the phrase are season words?
    matches = set(text_norm.split(" ")) & set(stem_of)

    # With several different season words there is no way to choose one.
    if len(matches) != 1:
        return False, None, None, None

    # The span always covers the singular stem, even for a plural token.
    stem = stem_of[matches.pop()]
    start_idx, end_idx = calculateSpan(text_norm, stem)
    absStart = refStart_span + start_idx
    absEnd = refStart_span + end_idx

    postag = ref_list[utils.getRefIdx(ref_list, absStart, absEnd)].getPos()
    if postag == "NN":
        return True, labels[stem], start_idx, end_idx

    return False, None, None, None
def _period_interval_class(classifier, my_features):
    """Classify one feature set; the caller treats 1 as period, else interval.

    ``classifier`` is indexed as ``(model, kind)``.  "NN" models go through
    ``ChronoKeras.keras_classify``, "SVM"/"RF" models through ``predict`` on
    an int-cast feature row, and every other kind through ``model.classify``.
    """
    model, kind = classifier[0], classifier[1]
    if kind == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if kind in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _period_interval_operators(s, my_entity, link_name, ref_Sspan, ref_list,
                               ref_idx, chrono_id, chrono_list):
    """Append a This, Next or Last operator for my_entity if the text has one.

    ``link_name`` is "period" or "repeating_interval"; it selects both the
    ``set_<link_name>`` setter used on a This operator and the keyword passed
    to the Next/Last operator constructors.  Returns the updated chrono_id.
    """
    # ref_idx - 1 must be a real position: with ref_idx <= 0 Python would
    # wrap around and inspect a token from the END of ref_list instead.
    prior_tok = ref_list[ref_idx - 1].getText().lower() if ref_idx > 0 else ""
    blanked = prior_tok.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    if blanked == "this":
        # Pattern first, searched string second.
        start_span, end_span = re.search("this", prior_tok).span(0)
        prior_start, _ = ref_list[ref_idx - 1].getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start + start_span,
            end_span=prior_start + end_span)
        chrono_id = chrono_id + 1
        getattr(chrono_this_entity, "set_" + link_name)(my_entity.get_id())
        chrono_list.append(chrono_this_entity)
        return chrono_id

    # No "this": look for a Next/Last modifier word inside the phrase.
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod:
        link = {link_name: my_entity.get_id()}
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **link))
            chrono_id = chrono_id + 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **link))
            chrono_id = chrono_id + 1
    return chrono_id


def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Create Period / Calendar-Interval entities for one TimePhrase.

    Args:
        s: TimePhrase entity (``getSpan()``, ``getText()``).
        chrono_id: Next free numeric entity id.
        chrono_list: List that new entities are appended to (mutated).
        ref_list: Reference tokens of the document.
        classifier: Indexable ``(model, kind)`` pair.
        feats: Feature template; a copy is handed to the feature extractor.

    Returns:
        ``(chrono_list, chrono_id)`` with chrono_id advanced once per entity
        created.

    NOTE(review): another ``def buildPeriodInterval`` appears later in this
    file; the later definition is the one bound at import time.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)

    # Terms that are always marked as calendar intervals.
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            s.getText()):
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", s.getText()):
            # "yesterday" additionally gets a Last operator pointing at the
            # interval entity that was just numbered (chrono_id - 1).
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)

    # Terms that are always marked as periods.
    elif boo and val == "Unknown":
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + idxstart,
            end_span=ref_Sspan + idxend,
            period_type=val,
            number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    elif boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())
        my_class = _period_interval_class(classifier, my_features)

        # 1 -> period, anything else -> calendar interval.
        if my_class == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
            link_name = "period"
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            link_name = "repeating_interval"
        chrono_id = chrono_id + 1

        chrono_id = _period_interval_operators(
            s, my_entity, link_name, ref_Sspan, ref_list, ref_idx,
            chrono_id, chrono_list)

        # A number, if any, is assumed to come before the interval word.
        if idxstart > 0:
            substr = s.getText()[0:idxstart]
            m = re.search('([0-9]{1,2})', substr)
            my_number_entity = None
            if m is not None:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=m.group(0))
            else:
                # No digits: try a spelled-out number.  Its span is taken to
                # run from the phrase start up to just before the interval.
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan,
                        end_span=ref_Sspan + (idxstart - 1),
                        value=texNumVal)

            if my_number_entity is not None:
                chrono_id = chrono_id + 1
                chrono_list.append(my_number_entity)
                my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            # First reference token overlapping the span, -1 when none does.
            ref_idx = next(
                (i for i, tok in enumerate(ref_list)
                 if utils.overlap(tok.getSpan(), (abs_Sspan, abs_Espan))),
                -1)

            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, feats.copy())
            # Same dispatch as the branch above, so "SVM"/"RF" models are
            # handled here too instead of falling through to .classify().
            my_class = _period_interval_class(classifier, my_features)

            if my_class == 1:
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val)
            chrono_id = chrono_id + 1

            # Find the number in the part of the phrase before the interval:
            # digits first, then the spelled-out number string.
            substr = s.getText()[:idxstart]
            num_match = re.search('([0-9]{1,2})', substr)
            num_value = num_match.group(0) if num_match is not None else None
            if num_match is None:
                texNumVal = utils.getNumberFromText(numstr)
                if texNumVal is not None:
                    # numstr is literal text, not a pattern.
                    num_match = re.search(re.escape(numstr), substr)
                    num_value = texNumVal

            if num_match is not None:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + num_match.span(0)[0],
                    end_span=ref_Sspan + num_match.span(0)[1],
                    value=num_value)
                chrono_id = chrono_id + 1
                chrono_list.append(my_number_entity)
                my_entity.set_number(my_number_entity.get_id())

            chrono_list.append(my_entity)

    return chrono_list, chrono_id
def _remove_entity(chrono_list, entity):
    """Delete ``entity`` from ``chrono_list``, matching by identity."""
    for pos, candidate in enumerate(chrono_list):
        if candidate is entity:
            del chrono_list[pos]
            return


def buildSubIntervals(chrono_list, chrono_id, dct, ref_list):
    """Link the entities of one phrase and add implicit Next/Last operators.

    Args:
        chrono_list: Entities built for the phrase (mutated in place).
        chrono_id: Next free numeric entity id.
        dct: Document creation time, or None; ``.month`` and ``.day`` are read.
        ref_list: Reference tokens; POS tags are read from them.

    Returns:
        ``(chrono_list, chrono_id)``.

    Entities are tracked by object reference rather than by list position.
    Deleting an AMPM, Time-Zone or Modifier entity shifts the position of
    everything behind it, so stored positions would point at the wrong entity
    (or past the end of the list) for every later step.
    """
    slot_for_type = {
        "Two-Digit-Year": "year",
        "Year": "year",
        "Month-Of-Year": "month",
        "Day-Of-Month": "day",
        "Hour-Of-Day": "hour",
        "Minute-Of-Hour": "minute",
        "Second-Of-Minute": "second",
        "Part-Of-Day": "daypart",
        "Day-Of-Week": "dayweek",
        "Calendar-Interval": "interval",
        "Period": "period",
        "NthFromStart": "nth",
        "Next": "nxt",
        "This": "this",
        "Time-Zone": "tz",
        "AMPM-Of-Day": "ampm",
        "Modifier": "modifier",
        "Last": "last",
    }

    # When a type occurs more than once, the last occurrence wins.
    found = {}
    for entity in chrono_list:
        slot = slot_for_type.get(entity.get_type())
        if slot is not None:
            found[slot] = entity

    year = found.get("year")
    month = found.get("month")
    day = found.get("day")
    hour = found.get("hour")
    minute = found.get("minute")
    second = found.get("second")
    daypart = found.get("daypart")
    dayweek = found.get("dayweek")
    interval = found.get("interval")
    period = found.get("period")
    nth = found.get("nth")
    tz = found.get("tz")
    ampm = found.get("ampm")
    modifier = found.get("modifier")

    def add_operator(operator_cls, target, start, end):
        """Append a Next/Last operator that repeats over ``target``."""
        nonlocal chrono_id
        chrono_list.append(
            operator_cls(entityID=str(chrono_id) + "entity",
                         start_span=start,
                         end_span=end,
                         repeating_interval=target.get_id()))
        chrono_id = chrono_id + 1

    # Implicit Next/Last: only for dates without a year and without an
    # explicit This/Next/Last operator already present in the phrase.
    no_explicit_operator = (found.get("this") is None
                            and found.get("nxt") is None
                            and found.get("last") is None)

    if year is None and dct is not None:
        if month is not None and no_explicit_operator:
            mStart = month.get_start_span()
            mEnd = month.get_end_span()
            my_month = utils.getMonthNumber(month.get_month_type())

            if day is not None and my_month == dct.month:
                # Same month as the document: decide on the day of month.
                if day.get_value() <= dct.day:
                    add_operator(chrono.ChronoLastOperator, month, mStart, mEnd)
                elif day.get_value() > dct.day:
                    add_operator(chrono.ChronoNextOperator, month, mStart, mEnd)
            elif my_month < dct.month:
                add_operator(chrono.ChronoLastOperator, month, mStart, mEnd)
            elif my_month > dct.month:
                add_operator(chrono.ChronoNextOperator, month, mStart, mEnd)

        # A bare day of week: use the tense of the closest preceding verb to
        # decide between past (Last) and present/future (Next).
        if dayweek is not None and no_explicit_operator:
            mStart = dayweek.get_start_span()
            mEnd = dayweek.get_end_span()
            ref = utils.getRefIdx(ref_list, mStart, mEnd)

            # Walk backwards and stop at the first verb.  ``ref > 0`` keeps
            # the original stop before token 0 and also keeps a negative
            # start index from running off the front of the list.
            while ref > 0:
                pos_tag = ref_list[ref].getPos()
                if "VB" in pos_tag:
                    if pos_tag in ["VBD", "VBN"]:
                        add_operator(chrono.ChronoLastOperator, dayweek,
                                     mStart, mEnd)
                    elif pos_tag in ["VB", "VBG", "VBP", "VBZ"]:
                        add_operator(chrono.ChronoNextOperator, dayweek,
                                     mStart, mEnd)
                    break
                ref -= 1

    # Chain sub-intervals from the finest unit up to the coarsest.
    if second is not None and minute is not None:
        minute.set_sub_interval(second.get_id())
    if minute is not None and hour is not None:
        hour.set_sub_interval(minute.get_id())
    if hour is not None and day is not None:
        day.set_sub_interval(hour.get_id())
    if day is not None and month is not None:
        month.set_sub_interval(day.get_id())
    if month is not None and year is not None:
        year.set_sub_interval(month.get_id())
    if dayweek is not None and hour is not None:
        dayweek.set_sub_interval(hour.get_id())
    # A part of day is linked only when no explicit hour is present.
    if dayweek is not None and daypart is not None and hour is None:
        dayweek.set_sub_interval(daypart.get_id())
    if day is not None and daypart is not None and hour is None:
        day.set_sub_interval(daypart.get_id())

    if nth is not None and period is not None:
        nth.set_period(period.get_id())
    elif nth is not None and interval is not None:
        nth.set_repeating_interval(interval.get_id())

    # AMPM and time zone only make sense attached to an hour; without one
    # the entity is dropped from the list.
    if ampm is not None:
        if hour is not None:
            hour.set_ampm(ampm.get_id())
        else:
            _remove_entity(chrono_list, ampm)

    if tz is not None:
        if hour is not None:
            hour.set_time_zone(tz.get_id())
        else:
            _remove_entity(chrono_list, tz)

    # A modifier attaches to a period first, else to a calendar interval;
    # with neither present it is dropped.
    if modifier is not None:
        if period is not None:
            period.set_modifier(modifier.get_id())
        elif interval is not None:
            interval.set_modifier(modifier.get_id())
        else:
            _remove_entity(chrono_list, modifier)

    return chrono_list, chrono_id
def _dose_duration_class(classifier, my_features):
    """Classify one feature set; the caller treats 1 as period, else interval.

    ``classifier`` is indexed as ``(model, kind)``.  "NN" models go through
    ``ChronoKeras.keras_classify``, "SVM"/"RF" models through ``predict`` on
    an int-cast feature row, and every other kind through ``model.classify``.
    """
    model, kind = classifier[0], classifier[1]
    if kind == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if kind in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _dose_duration_operators(s, my_entity, link_name, ref_Sspan, ref_list,
                             ref_idx, chrono_id, chrono_list):
    """Append a This, Next or Last operator for my_entity if the text has one.

    ``link_name`` is "period" or "repeating_interval"; it selects both the
    ``set_<link_name>`` setter used on a This operator and the keyword passed
    to the Next/Last operator constructors.  Returns the updated chrono_id.
    """
    # ref_idx - 1 must be a real position: with ref_idx <= 0 Python would
    # wrap around and inspect a token from the END of ref_list instead.
    prior_tok = ref_list[ref_idx - 1].getText().lower() if ref_idx > 0 else ""
    blanked = prior_tok.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    if blanked == "this":
        # Pattern first, searched string second.
        start_span, end_span = re.search("this", prior_tok).span(0)
        prior_start, _ = ref_list[ref_idx - 1].getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start + start_span,
            end_span=prior_start + end_span)
        chrono_id = chrono_id + 1
        getattr(chrono_this_entity, "set_" + link_name)(my_entity.get_id())
        chrono_list.append(chrono_this_entity)
        return chrono_id

    # No "this": look for a Next/Last modifier word inside the phrase.
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod:
        link = {link_name: my_entity.get_id()}
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **link))
            chrono_id = chrono_id + 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **link))
            chrono_id = chrono_id + 1
    return chrono_id


def buildDoseDuration(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Create a dose-duration entity for one TimePhrase, if it qualifies.

    Args:
        s: TimePhrase entity (``getSpan()``, ``getText()``).
        chrono_id: Next free numeric entity id.
        chrono_list: List that new entities are appended to (mutated).
        ref_list: Reference tokens of the document.
        classifier: Indexable ``(model, kind)`` pair.
        feats: Feature template; a copy is handed to the feature extractor.

    Returns:
        ``(chrono_list, chrono_id)``; both are returned unchanged when the
        phrase is rejected by any of the screening checks.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    text = s.getText()
    lowered = text.lower()
    parts = text.split()

    # --- screening: make sure the phrase really is a dose duration ---------
    # Blank text has no first word to inspect.
    if not parts:
        return chrono_list, chrono_id
    if isDoseDuration(parts[0]):
        return chrono_list, chrono_id
    # Case-insensitive words that rule the phrase out.
    if any(word in lowered for word in
           ("every", "time", "per", "once", "twice", "past", "ago")):
        return chrono_list, chrono_id
    # Phrases starting with q/Q followed by a digit.
    if re.match(r"^q\d|^Q\d", text):
        return chrono_list, chrono_id
    # Case-sensitive markers that rule the phrase out.
    if any(marker in text for marker in
           ("/", "[**", "**]", "ly", "(", ")", "RANDOM")):
        return chrono_list, chrono_id

    # Every word must map to a numeric (non-ordinal) token or to a
    # dose-duration term, and at least one number must be present.
    containsnum = False
    # Deleting via translate avoids building a regex character class out of
    # raw punctuation, where the "\]" pair is read as an escape.
    strip_punct = str.maketrans('', '', string.punctuation)
    for part in parts:
        part = part.translate(strip_punct).strip().lower()
        for ref in ref_list:
            ref_text = ref.getText()
            if ref_text.lower() != part:
                continue
            if ref.isNumeric():
                containsnum = True
                if utils.isOrdinal(ref_text):
                    return chrono_list, chrono_id
                break
            elif not tt.hasDoseDuration(ref_text.lower()):
                return chrono_list, chrono_id

    if not containsnum:
        return chrono_list, chrono_id

    # --- entity construction ------------------------------------------------
    boo, val, idxstart, idxend, plural = hasDoseDuration(s)
    if boo:
        # The entity covers the whole phrase.
        abs_Sspan = ref_Sspan
        abs_Espan = ref_Espan

        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())
        my_class = _dose_duration_class(classifier, my_features)

        # 1 -> period-like, anything else -> interval-like.
        if my_class == 1:
            dose_type = getDoseDurationValue(val)
            link_name = "period"
        else:
            dose_type = val
            link_name = "repeating_interval"

        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            dose_type=dose_type,
            number=None,
            text=text)
        chrono_id = chrono_id + 1

        chrono_id = _dose_duration_operators(
            s, my_entity, link_name, ref_Sspan, ref_list, ref_idx,
            chrono_id, chrono_list)

        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan
            abs_Espan = ref_Espan

            # First reference token overlapping the span, -1 when none does.
            ref_idx = next(
                (i for i, tok in enumerate(ref_list)
                 if utils.overlap(tok.getSpan(), (abs_Sspan, abs_Espan))),
                -1)

            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, feats.copy())
            # Same dispatch as the branch above, so "SVM"/"RF" models are
            # handled here too instead of falling through to .classify().
            my_class = _dose_duration_class(classifier, my_features)

            dose_type = getDoseDurationValue(val) if my_class == 1 else val
            my_entity = chrono.ChronoDoseDurationEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                dose_type=dose_type,
                number=None,
                text=text)
            chrono_id = chrono_id + 1

            chrono_list.append(my_entity)

    return chrono_list, chrono_id
def hasTextMonth(tpentity, ref_list):
    """Detect a month name or abbreviation written out in a TimePhrase.

    Three vocabularies are tried in order: full month names, abbreviations
    with a trailing period, and abbreviations without one.  In each pass only
    the first token that matches the vocabulary is considered; it counts only
    if the reference token at its span has POS tag "NNP".

    Args:
        tpentity: TimePhrase entity; ``getSpan()`` and ``getText()`` are used.
        ref_list: Reference tokens; ``getPos()`` is read from the token that
            ``utils.getRefIdx`` locates for the month word.

    Returns:
        ``(True, month_name, start_idx, end_idx)`` with the capitalised full
        month name and offsets into the lower-cased phrase text, otherwise
        ``(False, None, None, None)``.
    """
    phrase_start, _phrase_end = tpentity.getSpan()
    lowered = tpentity.getText().lower()

    def tokens_without(chars):
        # Replace each of ``chars`` by a blank, trim, split on single spaces.
        table = str.maketrans(chars, ' ' * len(chars))
        return lowered.translate(table).strip().split(" ")

    # Punctuation set that leaves '.' alone so "jan." survives tokenisation.
    all_but_period = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'

    # Insertion order is significant: the first vocabulary entry that is
    # contained in a token is the one reported.
    full_names = {
        "january": "January", "february": "February", "march": "March",
        "april": "April", "may": "May", "june": "June", "july": "July",
        "august": "August", "september": "September", "october": "October",
        "november": "November", "december": "December",
    }
    dotted_abbr = {
        "jan.": "January", "feb.": "February", "mar.": "March",
        "apr.": "April", "jun.": "June", "jul.": "July", "aug.": "August",
        "sept.": "September", "sep.": "September", "oct.": "October",
        "nov.": "November", "dec.": "December",
    }
    bare_abbr = {
        "jan": "January", "feb": "February", "mar": "March", "apr": "April",
        "jun": "June", "jul": "July", "aug": "August", "sept": "September",
        "sep": "September", "oct": "October", "nov": "November",
        "dec": "December",
    }

    abbr_tokens = tokens_without(all_but_period)
    passes = (
        (tokens_without(string.punctuation), full_names),
        (abbr_tokens, dotted_abbr),
        (abbr_tokens, bare_abbr),
    )

    for tokens, vocabulary in passes:
        for tok in tokens:
            # The token has to be part of some entry ...
            if not any(tok in entry for entry in vocabulary):
                continue
            # ... and some entry has to be part of the token.
            hit = next((entry for entry in vocabulary if entry in tok), None)
            if hit is None:
                continue

            # Spans are computed on the lower-cased text with punctuation
            # intact, so they index into the original phrase.
            start_idx, end_idx = calculateSpan(lowered, hit)
            token_pos = ref_list[utils.getRefIdx(
                ref_list, phrase_start + start_idx,
                phrase_start + end_idx)].getPos()
            if token_pos == "NNP":
                return True, vocabulary[hit], start_idx, end_idx

            # Only the first matching token of a pass is examined.
            break

    return False, None, None, None
def _classifyPeriodOrInterval(ref_list, ref_idx, classifier, feats):
    """Predict whether the token at ``ref_idx`` is a period or an interval.

    ``classifier`` is indexed as ``(model, kind)``. Kind ``"NN"`` is routed
    through ``ChronoKeras.keras_classify``, ``"SVM"``/``"RF"`` through
    ``model.predict`` and any other kind through ``model.classify``.

    Returns the predicted class. The caller treats ``1`` as a period and
    every other value as a calendar interval.
    """
    # Hand the feature extractor a copy of feats rather than feats itself.
    my_features = utils.extract_prediction_features(ref_list, ref_idx,
                                                    feats.copy())

    if classifier[1] == "NN":
        return ChronoKeras.keras_classify(
            classifier[0], np.array(list(my_features.values())))
    if classifier[1] in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return classifier[0].predict([feat_array])[0]
    return classifier[0].classify(my_features)


def _addThisNextLastOperator(s, ref_list, ref_idx, ref_Sspan, chrono_id,
                             chrono_list, my_entity, link_name):
    """Append a This/Next/Last operator that points at ``my_entity``.

    ``link_name`` is ``"period"`` when ``my_entity`` is a period entity and
    ``"repeating_interval"`` when it is a calendar interval; it selects both
    the constructor keyword and the setter used to link the operator.

    Returns the (possibly incremented) ``chrono_id``.
    """
    # Only look at the preceding reference token when one exists. Without
    # this guard an index of 0 would wrap around to the LAST token in
    # ref_list and could link a "this" from the end of the document.
    prior_tok = ref_list[ref_idx - 1].getText().lower() if ref_idx > 0 else ""

    # Punctuation is replaced by spaces (not removed), so this comparison
    # only succeeds when the prior token is exactly "this".
    prior_norm = prior_tok.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    if prior_norm == "this":
        # add a This entity and link it to the period/interval.
        # re.search takes (pattern, string): look for "this" in the token.
        start_span, end_span = re.search("this", prior_tok).span(0)
        prior_start, _ = ref_list[ref_idx - 1].getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start + start_span,
            end_span=prior_start + end_span)
        chrono_id = chrono_id + 1
        if link_name == "period":
            chrono_this_entity.set_period(my_entity.get_id())
        else:
            chrono_this_entity.set_repeating_interval(my_entity.get_id())
        chrono_list.append(chrono_this_entity)
        return chrono_id

    # No leading "this": check the phrase itself for a Next/Last word.
    hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
    if hasMod:
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **{link_name: my_entity.get_id()}))
            chrono_id = chrono_id + 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **{link_name: my_entity.get_id()}))
            chrono_id = chrono_id + 1
    return chrono_id


def _linkPrecedingNumber(s, idxstart, ref_Sspan, chrono_id, chrono_list,
                         my_entity):
    """Create a ChronoNumber for a number directly before the interval term.

    The number is assumed to come before the interval string. The previous
    reference token cannot be used for this because phrases such as
    "42-year-old" are a single reference token, so the span is derived with
    index arithmetic on the phrase text instead.

    Returns the (possibly incremented) ``chrono_id``.
    """
    if idxstart <= 0:
        # The interval term starts the phrase; nothing can precede it.
        return chrono_id

    leading_text = s.getText()[0:idxstart]
    # Phrases like "six-months" have no space before the interval term, which
    # shifts the end of the text-number span by one character (see below).
    has_space = leading_text.endswith(' ')

    ## purposefully split on a single space
    tokens = leading_text.strip(' ').split(' ')
    prevtok = tokens[-1]

    ## length of everything before prevtok, plus the joining space
    if len(tokens) > 1:
        rest_of_phrase_length = len(' '.join(tokens[:-1])) + 1
    else:
        rest_of_phrase_length = 0

    m = re.search('([0-9]{1,2})', prevtok)
    if m is not None:
        num_val = m.group(0)
        num_Sspan = ref_Sspan + rest_of_phrase_length + m.span(0)[0]
        num_Espan = ref_Sspan + rest_of_phrase_length + m.span(0)[1]
    else:
        # else search for a text number
        num_val = utils.getNumberFromText(prevtok)
        if num_val is None:
            return chrono_id
        num_Sspan = ref_Sspan + rest_of_phrase_length
        num_Espan = num_Sspan + len(prevtok)
        if not has_space:
            # prevtok still carries the joining character (e.g. "six-").
            num_Espan = num_Espan - 1

    my_number_entity = chrono.ChronoNumber(
        entityID=str(chrono_id) + "entity",
        start_span=num_Sspan,
        end_span=num_Espan,
        value=num_val)
    chrono_id = chrono_id + 1

    # add the number entity to the list and link it to the interval entity
    chrono_list.append(my_number_entity)
    my_entity.set_number(my_number_entity.get_id())
    return chrono_id


def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build period / calendar-interval entities for one temporal phrase.

    Four cases are handled, in this order:

    1. ``hasPeriodInterval(s)`` matches and the phrase text contains one of
       the always-interval terms (yesterday, tomorrow, today, daily, /min,
       /week): a calendar interval is created, plus a Last operator for
       "yesterday".
    2. It matches with value ``"Unknown"``: a period entity is created.
    3. It matches otherwise: the classifier decides between period and
       calendar interval, then a This/Next/Last operator and a preceding
       number are linked when present.
    4. It does not match: ``hasEmbeddedPeriodInterval(s)`` is tried, the
       classifier decides the entity type and a preceding number is linked
       when present.

    Args:
        s: phrase object providing ``getSpan()`` and ``getText()``.
        chrono_id: integer counter used to build entity IDs.
        chrono_list: list the new entities are appended to (modified in
            place).
        ref_list: reference tokens providing ``getSpan()`` and ``getText()``.
        classifier: indexable pair ``(model, kind)``; see
            ``_classifyPeriodOrInterval``.
        feats: feature dictionary; a copy is passed to the feature extractor.

    Returns:
        The tuple ``(chrono_list, chrono_id)`` with the updated counter.
    """
    ref_Sspan, _ = s.getSpan()
    boo, val, idxstart, idxend, _ = hasPeriodInterval(s)

    # FIND terms that are always marked as calendar intervals!
    # NOTE: the pattern is matched against the phrase text as-is (the text is
    # not lower-cased here).
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            s.getText()):
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", s.getText()):
            # chrono_id - 1 is the ID number just given to my_entity.
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)

    # FIND terms that are always marked as periods!
    elif boo and val == "Unknown":
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + idxstart,
            end_span=ref_Sspan + idxend,
            period_type=val,
            number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    elif boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # get index of overlapping reference token
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)

        # classify into period or interval
        my_class = _classifyPeriodOrInterval(ref_list, ref_idx, classifier,
                                             feats)

        # if 1 then it is a period, otherwise it is an interval
        if my_class == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
            link_name = "period"
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            link_name = "repeating_interval"
        chrono_id = chrono_id + 1

        # Operators and numbers are appended before the entity they refer to.
        chrono_id = _addThisNextLastOperator(s, ref_list, ref_idx, ref_Sspan,
                                             chrono_id, chrono_list,
                                             my_entity, link_name)
        chrono_id = _linkPrecedingNumber(s, idxstart, ref_Sspan, chrono_id,
                                         chrono_list, my_entity)
        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, _ = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            # get index of the first overlapping reference token (-1 if none)
            ref_idx = next(
                (i for i, tok in enumerate(ref_list)
                 if utils.overlap(tok.getSpan(), (abs_Sspan, abs_Espan))),
                -1)

            # classify into period or interval
            my_class = _classifyPeriodOrInterval(ref_list, ref_idx,
                                                 classifier, feats)

            # if 1 then it is a period, otherwise it is an interval
            if my_class == 1:
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val)
            chrono_id = chrono_id + 1

            # Extract the number and identify its span
            chrono_id = _linkPrecedingNumber(s, idxstart, ref_Sspan,
                                             chrono_id, chrono_list,
                                             my_entity)
            chrono_list.append(my_entity)

    return chrono_list, chrono_id