def hasTextYear(tpentity):
    """Test whether a temporal phrase spells a year out in words.

    Args:
        tpentity: Object exposing ``getText()``; the phrase to inspect.

    Returns:
        tuple: ``(True, value, start, end)`` when every whitespace-separated
        token converts to a number through ``utils.getNumberFromText`` and the
        whole phrase converts to a value between 1500 and 2050 inclusive.
        ``start``/``end`` locate the phrase, minus leading/trailing commas and
        periods, inside ``tpentity.getText()``.  In every other case
        ``(False, None, None, None)``.
    """
    no_match = (False, None, None, None)

    raw_text = tpentity.getText()
    # remove ending punctuation
    trimmed = raw_text.strip(",.")
    # replace all other punctuation with spaces
    spaced = trimmed.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # The pattern may match the empty string, so this only rejects phrases
    # whose first character is not a letter, comma or whitespace.
    prefix = re.search(r'[a-z,A-Z,-,\s]*', spaced)
    # Compare by value: identity comparison against a literal is unreliable.
    if prefix.group(0) == '':
        return no_match

    # every token on its own has to be readable as a number
    for token in WhitespaceTokenizer().tokenize(spaced):
        if utils.getNumberFromText(token) is None:
            return no_match

    val = utils.getNumberFromText(spaced)
    if val is None or not 1500 <= val <= 2050:
        return no_match

    # Locate the trimmed phrase literally; treating it as a regex pattern
    # would misbehave on characters such as "(" or ".".
    start = raw_text.find(trimmed)
    return True, val, start, start + len(trimmed)
def hasDateOrTime(text):
    """Heuristically decide whether a string contains a numeric date or time.

    Punctuation (except ``.`` and the backslash, which are not in the
    replacement set) is turned into spaces and the string is split on single
    spaces.  A token counts as temporal when

    * it has 4 characters and ``utils.getNumberFromText`` reads it as a number
      between 1800 and 2050 inclusive (not every 4-digit number is a year, so
      a fixed range is used), or
    * it has exactly 6 characters (could be yymmdd or mmddyy) or exactly
      8 characters.  No further validation is done for these lengths because
      the ranges cannot be narrowed down without context.

    Args:
        text: The string to inspect.

    Returns:
        bool: True as soon as one token qualifies, otherwise False.
    """
    punct = "!\"#$%&\'()*+,-/:;<=>?@[]^_`{|}~"
    text_norm = text.translate(str.maketrans(punct, ' ' * len(punct))).strip()

    for token in text_norm.split(' '):
        if len(token) == 4:
            num = utils.getNumberFromText(token)
            # A 4-character word that is not a number yields None; comparing
            # None with an int would raise TypeError, so guard first.
            if num is not None and 1800 <= num <= 2050:
                return True
        elif len(token) in (6, 8):
            return True

    return False
def hasYear(tpentity, flags):
    """Look for a 4-digit year inside a formatted date in the phrase.

    The phrase is lower-cased, commas become spaces, and each space-separated
    token is tested for, in order:

    1. ``d-m-yyyy`` style dates (separators ``-``, ``/`` or ``:``; the middle
       part may be 1-2 digits or a 3-4 letter month) -> year is the last part;
    2. for tokens longer than 7 characters, ``yyyy-m-d`` -> year is the first
       part;
    3. for 6-character tokens, the special form ``c.yyyy`` with the year
       between 1500 and 2050.

    Args:
        tpentity: Object exposing ``getText()``.
        flags: Passed through unchanged as the last element of the result.

    Returns:
        tuple: ``(True, year, start, end, flags)`` for the first hit, else
        ``(False, None, None, None, flags)``.  ``year`` is a string for the
        date formats and the value returned by ``utils.getNumberFromText``
        for ``c.yyyy``.  Offsets are token start + position inside the token,
        where the token start comes from
        ``Chrono.utils.calculateSpan(text_norm, token)`` (presumably the
        position of the token inside the normalised phrase - confirm).
    """
    text_lower = tpentity.getText().lower()
    # only commas are replaced; other punctuation is needed by the patterns
    text_norm = text_lower.translate(str.maketrans(",", ' ')).strip()

    for token in text_norm.split(" "):
        # start coordinate of this token in the full string, so the position
        # of a match can be reported relative to the whole phrase
        token_start, _token_end = Chrono.utils.calculateSpan(text_norm, token)

        # year at the end of the date
        match = re.search(
            r'([0-9]{1,2})[-/:]([0-9]{1,2}|[A-Za-z]{3,4})[-/:]([0-9]{4})',
            token)
        if match:
            year_start, year_end = match.span(3)
            return (True, match.group(3), token_start + year_start,
                    token_start + year_end, flags)

        ## look for year at start of date
        if len(token) > 7:
            match = re.search(
                r'([0-9]{4})[-/:]([0-9]{1,2}|[A-Za-z]{3,4})[-/:]([0-9]{1,2})',
                token)
            if match:
                # Use the match position inside the token; measuring inside
                # the matched substring loses any prefix before the date.
                year_start, year_end = match.span(1)
                return (True, match.group(1), token_start + year_start,
                        token_start + year_end, flags)

        ## special case to look for c.yyyy
        elif len(token) == 6:
            match = re.search(r"c\.([0-9]{4})", token)
            if match:
                rval = utils.getNumberFromText(match.group(1))
                if rval and 1500 <= rval <= 2050:
                    year_start, year_end = match.span(1)
                    # Offset by the token start like the other branches do,
                    # otherwise the span is only valid for the first token.
                    return (True, rval, token_start + year_start,
                            token_start + year_end, flags)

    # no 4 digit year expression was found
    return False, None, None, None, flags
def extract_numeric_feature(reftok_list, reftok_idx, obs_list):
    """Record whether a token directly next to the reference token is numeric.

    The neighbour before is tested first, then the neighbour after; indices
    are clamped to the list so a neighbour that collapses onto the reference
    token itself is ignored.  A neighbour is numeric when
    ``utils.getNumberFromText`` of its text is an ``int``.

    Args:
        reftok_list: Sequence of tokens exposing ``getText()``.
        reftok_idx: Index of the reference token in ``reftok_list``.
        obs_list: Feature dict, updated in place with key ``'feat_numeric'``.

    Returns:
        The same ``obs_list`` object with ``'feat_numeric'`` set to 1 or 0.
    """
    final_idx = len(reftok_list) - 1
    neighbours = (max(reftok_idx - 1, 0), min(reftok_idx + 1, final_idx))

    has_numeric_neighbour = False
    for pos in neighbours:
        if pos == reftok_idx:
            # clamped onto the reference token: there is no neighbour here
            continue
        neighbour_value = utils.getNumberFromText(reftok_list[pos].getText())
        if isinstance(neighbour_value, int):
            has_numeric_neighbour = True
            break

    obs_list.update({'feat_numeric': 1 if has_numeric_neighbour else 0})
    return obs_list
def has24HourTime(text):
    """Decide whether a string contains a 4-digit 24-hour time such as 1830.

    Punctuation (the characters in ``punct``) is replaced by spaces and the
    string is split on single spaces.  The first 4-character token that
    ``utils.getNumberFromText`` can read as a whole, and whose two halves can
    both be read as well, decides the result.

    Args:
        text: The string to inspect.

    Returns:
        bool: True when that token has hour < 24 and minute < 60, False when
        it does not or when no such token exists.
    """
    punct = "!\"#$%&\'()*+,-/:;<=>?@[]^_`{|}~"
    to_spaces = str.maketrans(punct, ' ' * len(punct))

    for token in text.translate(to_spaces).strip().split(' '):
        if len(token) != 4:
            continue
        if utils.getNumberFromText(token) is None:
            continue

        hour = utils.getNumberFromText(token[:2])
        minute = utils.getNumberFromText(token[2:])
        if hour is None or minute is None:
            continue

        # the first fully readable token settles the answer either way
        return not ((minute >= 60) or (hour >= 24))

    return False
def has24HourTime(tpentity, flags):
    """Find a 24-hour time (``hhmm`` or ``hhmm`` + time zone) in a phrase.

    Args:
        tpentity: Object exposing ``getText()``.
        flags: Dict of parser flags; only ``flags["loneDigitYear"]`` is read.
            When it is set nothing is searched.

    Returns:
        tuple: ``(True, matched_text, start, end)`` for the first hit, where
        ``start``/``end`` come from ``calculateSpan(phrase, matched_text)``;
        otherwise ``(False, None, None, None)``.  A readable 4-character
        token with an out-of-range hour or minute ends the search with the
        negative result.
    """
    not_found = (False, None, None, None)

    stext = tpentity.getText()

    # if the loneDigitYear flag has already been set, do not search at all
    if flags["loneDigitYear"]:
        return not_found

    for token in stext.split(" "):
        if len(token) == 4:
            if utils.getNumberFromText(token) is None:
                continue
            hour = utils.getNumberFromText(token[:2])
            minute = utils.getNumberFromText(token[2:])
            if hour is None or minute is None:
                continue
            # Valid 24-hour clock values are 00-23 and 00-59; ">" would let
            # minute 60 and hour 24 through (the plain-text variant of this
            # check in the same file also uses ">=").
            if (minute >= 60) or (hour >= 24):
                return not_found
            start_idx, end_idx = calculateSpan(stext, token)
            return True, token, start_idx, end_idx

        # up to four digits directly followed by a time zone abbreviation
        tz_format = re.search(
            r'\d{0,4}(AST|EST|EDT|CST|CDT|MST|MDT|PST|PDT|AKST|HST|HAST|HADT|SST|SDT|GMT|CHST|UTC)',
            token)
        if tz_format is not None:
            matched = tz_format[0]
            start_idx, end_idx = calculateSpan(stext, matched)
            return True, matched, start_idx, end_idx

    # no 24-hour time expression was found
    return not_found
def hasEmbeddedPeriodInterval(tpentity):
    """Find a period/interval word glued to a number, e.g. "3days" or "2hrs".

    Punctuation is replaced by spaces and the phrase is split on single
    spaces.  For each token, each search term is looked for as a substring
    that does not start the token; the part before it must be read as an
    ``int`` by ``utils.getNumberFromText``.  The remainder of the token (from
    the term onwards) then has to be one of the known period words.

    Args:
        tpentity: Object exposing ``getText()``.

    Returns:
        tuple: ``(True, period_type, start, end, number_text)`` where
        ``period_type`` is "Day", "Week", "Month" or "Hour", ``start``/``end``
        come from ``calculateSpan(normalised_phrase, term_text)`` and
        ``number_text`` is the token prefix.  ``(False, None, None, None,
        None)`` when nothing matches, or as soon as a number-prefixed term has
        an unknown suffix.
    """
    not_found = (False, None, None, None, None)

    text = tpentity.getText()
    # same length as the original text, so offsets stay comparable
    text_norm = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # substrings to look for inside a token
    terms = [
        "day", "week", "month", "hour", "days", "weeks", "months", "hours",
        "hrs"
    ]
    # full term text -> period type.  "hrs" is listed here as well: it is one
    # of the search terms, and without a mapping "3hrs" could never succeed.
    term_types = {
        "day": "Day", "daily": "Day", "days": "Day",
        "week": "Week", "weekly": "Week", "weeks": "Week",
        "month": "Month", "monthly": "Month", "months": "Month",
        "hour": "Hour", "hourly": "Hour", "hours": "Hour", "hrs": "Hour",
    }

    for token in text_norm.split(" "):
        for term in terms:
            idx = token.find(term)
            if idx <= 0:
                # absent, or the token starts with the term (no number part)
                continue

            number_part = token[:idx]
            term_part = token[idx:]
            if not isinstance(utils.getNumberFromText(number_part), int):
                continue

            start_idx, end_idx = calculateSpan(text_norm, term_part)
            period_type = term_types.get(term_part)
            if period_type is None:
                return not_found
            return True, period_type, start_idx, end_idx, number_part

    return not_found
def extract_bow_features(reftok_list, reftok_idx, window, obs_dict, obs_list):
    """Add bag-of-words features for the tokens around a reference token.

    Every visited token other than the reference token contributes one key:
    the ``int`` returned by ``utils.getNumberFromText`` for its text when
    there is one, otherwise the token text itself.  The key is set to 1 in
    ``obs_list`` and to 0 in ``obs_dict``.

    Args:
        reftok_list: Sequence of tokens exposing ``getText()``.
        reftok_idx: Index of the reference token.
        window: Number of tokens to look at on each side.
        obs_dict: Dict updated in place with ``key -> 0``.
        obs_list: Dict updated in place with ``key -> 1``.

    Returns:
        tuple: ``(obs_list, obs_dict)`` - the same objects that were passed in.
    """
    first = max(reftok_idx - window, 0)
    # NOTE(review): the bound is capped at len - 1 and range() excludes its
    # end, so the final token of reftok_list is never visited - confirm this
    # is intended before changing it, since it alters the feature set.
    stop = min(reftok_idx + (window + 1), len(reftok_list) - 1)

    for pos in range(first, stop):
        if pos == reftok_idx:
            continue

        token_text = reftok_list[pos].getText()
        as_number = utils.getNumberFromText(token_text)
        feature_key = as_number if isinstance(as_number, int) else token_text

        obs_list.update({feature_key: 1})
        obs_dict.update({feature_key: 0})

    return (obs_list, obs_dict)
def _classify_period_or_interval(classifier, my_features):
    """Run the ``(model, model_type)`` pair on a feature dict and return the predicted class."""
    model, model_type = classifier[0], classifier[1]
    if model_type == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if model_type in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _add_this_or_modifier(s, my_entity, link_kwarg, ref_list, ref_idx,
                          ref_Sspan, chrono_id, chrono_list):
    """Append a This, Next or Last operator linked to ``my_entity`` if the phrase asks for one; return the next free id.

    ``link_kwarg`` is ``"period"`` or ``"repeating_interval"`` and selects how
    the operator refers to ``my_entity``.
    """
    # Only look at the previous token when there is one: index 0 - 1 would
    # silently wrap around to the last token of ref_list.
    if ref_idx is not None and ref_idx > 0:
        prior = ref_list[ref_idx - 1]
        # The original test replaced punctuation by spaces before comparing
        # with "this", which can only succeed for the bare word itself.
        if prior.getText().lower() == "this":
            prior_start, _prior_end = prior.getSpan()
            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start,
                end_span=prior_start + len("this"))
            chrono_id = chrono_id + 1
            if link_kwarg == "period":
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
            return chrono_id

    # no "this" in front: check for a Next/Last word inside the phrase
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod:
        link = {link_kwarg: my_entity.get_id()}
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **link))
            chrono_id = chrono_id + 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **link))
            chrono_id = chrono_id + 1
    return chrono_id


def _link_leading_digits(substr, ref_Sspan, my_entity, chrono_id, chrono_list):
    """Turn the first 1-2 digit run of ``substr`` into a ChronoNumber linked to ``my_entity``; return ``(found, next_id)``."""
    m = re.search(r'([0-9]{1,2})', substr)
    if m is None:
        return False, chrono_id

    my_number_entity = chrono.ChronoNumber(
        entityID=str(chrono_id) + "entity",
        start_span=ref_Sspan + m.span(0)[0],
        end_span=ref_Sspan + m.span(0)[1],
        value=m.group(0))
    chrono_list.append(my_number_entity)
    my_entity.set_number(my_number_entity.get_id())
    return True, chrono_id + 1


def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build period / calendar-interval entities for a temporal phrase.

    Cases, in order:

    1. Phrases containing yesterday/tomorrow/today/daily//min//week are always
       calendar intervals ("yesterday" additionally gets a Last operator).
    2. A period/interval whose type is "Unknown" is always a period.
    3. Any other period/interval word is classified (1 = period, anything
       else = calendar interval), linked to a This/Next/Last operator when
       present, and to a number found in the text before it.
    4. If ``hasPeriodInterval`` finds nothing, ``hasEmbeddedPeriodInterval``
       is tried (number glued to the term) and handled like case 3 without
       the operator step.

    Args:
        s: Temporal phrase exposing ``getText()`` and ``getSpan()``.
        chrono_id: Next free integer used to build entity ids.
        chrono_list: List that receives the new entities (mutated in place).
        ref_list: Reference tokens exposing ``getText()`` and ``getSpan()``.
        classifier: ``(model, model_type)`` pair; ``model_type`` "NN", "SVM"
            or "RF" selects the calling convention, anything else uses
            ``model.classify``.
        feats: Feature template; a copy is handed to the feature extractor.

    Returns:
        tuple: ``(chrono_list, chrono_id)`` with the id advanced past every
        entity that was created.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)

    if boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # FIND terms that are always marked as calendar intervals!
        if re.search(
                "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
                s.getText()):
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            chrono_id = chrono_id + 1

            if re.search("yesterday|yesterdays", s.getText()):
                # chrono_id - 1 is the id just given to my_entity
                my_last_entity = chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    repeating_interval=str(chrono_id - 1) + "entity")
                chrono_id = chrono_id + 1
                chrono_list.append(my_last_entity)

            chrono_list.append(my_entity)
            return chrono_list, chrono_id

        # FIND terms that are always marked as periods!
        if val == "Unknown":
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=val,
                number=None)
            chrono_id = chrono_id + 1
            chrono_list.append(my_entity)
            return chrono_list, chrono_id

        # get index of overlapping reference token and extract ML features
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # classify into period or interval:
        # if 1 then it is a period, otherwise it is an interval
        my_class = _classify_period_or_interval(classifier, my_features)
        if my_class == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
            link_kwarg = "period"
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            link_kwarg = "repeating_interval"
        chrono_id = chrono_id + 1

        chrono_id = _add_this_or_modifier(s, my_entity, link_kwarg, ref_list,
                                          ref_idx, ref_Sspan, chrono_id,
                                          chrono_list)

        # check to see if it has a number associated with it.
        # We assume the number comes before the interval string
        if idxstart > 0:
            substr = s.getText()[0:idxstart]
            found, chrono_id = _link_leading_digits(
                substr, ref_Sspan, my_entity, chrono_id, chrono_list)
            if not found:
                # else search for a text number
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan,
                        end_span=ref_Sspan + (idxstart - 1),
                        value=texNumVal)
                    chrono_id = chrono_id + 1
                    chrono_list.append(my_number_entity)
                    my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # nothing stand-alone was found: try a term glued to its number
    boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
    if not boo2:
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    # get index of overlapping reference token (-1 when none overlaps)
    ref_idx = -1
    for i, ref_tok in enumerate(ref_list):
        if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
            ref_idx = i
            break

    my_features = utils.extract_prediction_features(ref_list, ref_idx,
                                                    feats.copy())

    # Same dispatch as above, so SVM/RF models are called through predict()
    # here too instead of a classify() method.
    my_class = _classify_period_or_interval(classifier, my_features)
    if my_class == 1:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val)
    chrono_id = chrono_id + 1

    # Extract the number and identify the span of numstr
    substr = s.getText()[:idxstart]  # entire first part of the phrase
    found, chrono_id = _link_leading_digits(substr, ref_Sspan, my_entity,
                                            chrono_id, chrono_list)
    if not found:
        # else search for a text number
        texNumVal = utils.getNumberFromText(numstr)
        if texNumVal is not None:
            # literal search for the number string in the subphrase
            num_pos = substr.find(numstr)
            if num_pos != -1:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + num_pos,
                    end_span=ref_Sspan + num_pos + len(numstr),
                    value=texNumVal)
                chrono_id = chrono_id + 1
                chrono_list.append(my_number_entity)
                my_entity.set_number(my_number_entity.get_id())

    chrono_list.append(my_entity)
    return chrono_list, chrono_id
def _valid_month_day(month, day):
    """Return True when both values are present and name a month 1-12 and a day 1-31."""
    return (month is not None and day is not None
            and 1 <= month <= 12 and 1 <= day <= 31)


def _append_condensed_date(chrono_list, chrono_id, base, year_cls, year_part,
                           month_part, day_part):
    """Append linked year -> month -> day entities; return the next free id.

    Each ``*_part`` is ``(value, start, end)`` with offsets relative to
    ``base``.  ``year_cls`` is the entity class used for the year.
    """
    year, y_start, y_end = year_part
    month, m_start, m_end = month_part
    day, d_start, d_end = day_part

    # add year
    chrono_year_entity = year_cls(entityID=str(chrono_id) + "entity",
                                  start_span=base + y_start,
                                  end_span=base + y_end,
                                  value=year)
    chrono_id = chrono_id + 1
    # add month
    chrono_month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=base + m_start,
        end_span=base + m_end,
        month_type=calendar.month_name[month])
    chrono_id = chrono_id + 1
    chrono_year_entity.set_sub_interval(chrono_month_entity.get_id())
    # add day
    chrono_day_entity = chrono.ChronoDayOfMonthEntity(
        entityID=str(chrono_id) + "entity",
        start_span=base + d_start,
        end_span=base + d_end,
        value=day)
    chrono_id = chrono_id + 1
    chrono_month_entity.set_sub_interval(chrono_day_entity.get_id())

    chrono_list.append(chrono_year_entity)
    chrono_list.append(chrono_month_entity)
    chrono_list.append(chrono_day_entity)
    return chrono_id


def buildNumericDate(s, chrono_id, chrono_list, flags):
    """Build entities for purely numeric dates found in a temporal phrase.

    Each space-separated token (lower-cased; leading/trailing ``.`` and ``,``
    of the phrase removed) is handled by length:

    * 4 characters: a number between 1500 and 2050 becomes a year, unless
      ``flags["fourdigityear"]`` or ``flags["loneDigitYear"]`` is already set;
      ``flags["loneDigitYear"]`` is then set.  Note that 24-hour times in this
      range are interpreted as years (a form such as 1800EDT is longer than
      4 characters and is not parsed here).
    * 8 characters: ``yyyymmdd`` is tried first, then ``mmddyyyy``.
    * 6 characters: ``mmddyy`` is tried first, then ``yymmdd``.  Dates such
      as 120106 vs 061201 are not distinguishable.

    Args:
        s: Temporal phrase exposing ``getText()`` and ``getSpan()``.
        chrono_id: Next free integer used to build entity ids.
        chrono_list: List that receives the new entities (mutated in place).
        flags: Dict of parser flags (see above).

    Returns:
        tuple: ``(chrono_list, chrono_id, flags)``.
    """
    raw_text = s.getText()
    # strip(".,") may remove leading characters; remember how many so that
    # token offsets stay relative to the start of the phrase.
    offset = len(raw_text) - len(raw_text.lstrip(".,"))

    for raw_token in raw_text.strip(".,").split(" "):
        token_start = offset
        offset += len(raw_token) + 1  # skip this token and the space after it
        text = raw_token.lower()

        ## See if there is a 4 digit number and assume it is a year if
        ## between 1500 and 2050
        if len(text) == 4:
            num = utils.getNumberFromText(text)
            if (num is not None and 1500 <= num <= 2050
                    and not flags["fourdigityear"]
                    and not flags["loneDigitYear"]):
                flags["loneDigitYear"] = True
                ref_StartSpan, ref_EndSpan = s.getSpan()
                # The token position is known; searching for the token as a
                # regex could hit an earlier substring or fail outright.
                chrono_year_entity = chrono.ChronoYearEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_StartSpan + token_start,
                    end_span=ref_StartSpan + token_start + len(raw_token),
                    value=num)
                chrono_id = chrono_id + 1
                chrono_list.append(chrono_year_entity)

        ## parse out the condensed date format like 19980303 or 03031998.
        elif len(text) == 8 and utils.getNumberFromText(text) is not None:
            # Identify format yyyymmdd
            y = utils.getNumberFromText(text[0:4])
            m = utils.getNumberFromText(text[4:6])
            d = utils.getNumberFromText(text[6:8])
            if y is not None and 1500 <= y <= 2050 and _valid_month_day(m, d):
                ref_StartSpan, ref_EndSpan = s.getSpan()
                chrono_id = _append_condensed_date(
                    chrono_list, chrono_id, ref_StartSpan + token_start,
                    chrono.ChronoYearEntity, (y, 0, 4), (m, 4, 6), (d, 6, 8))
            else:
                # test for mmddyyyy
                y2 = utils.getNumberFromText(text[4:8])
                m2 = utils.getNumberFromText(text[0:2])
                d2 = utils.getNumberFromText(text[2:4])
                if (y2 is not None and 1500 <= y2 <= 2050
                        and _valid_month_day(m2, d2)):
                    ref_StartSpan, ref_EndSpan = s.getSpan()
                    # use the values parsed for THIS layout (y2/m2/d2), not
                    # the ones from the rejected yyyymmdd attempt
                    chrono_id = _append_condensed_date(
                        chrono_list, chrono_id, ref_StartSpan + token_start,
                        chrono.ChronoYearEntity, (y2, 4, 8), (m2, 0, 2),
                        (d2, 2, 4))

        ## parse out the condensed date format like 030399 or 990303.
        elif len(text) == 6 and utils.getNumberFromText(text) is not None:
            # Identify format mmddyy
            y = utils.getNumberFromText(text[4:6])
            m = utils.getNumberFromText(text[0:2])
            d = utils.getNumberFromText(text[2:4])
            if y is not None and _valid_month_day(m, d):
                ref_StartSpan, ref_EndSpan = s.getSpan()
                chrono_id = _append_condensed_date(
                    chrono_list, chrono_id, ref_StartSpan + token_start,
                    chrono.ChronoTwoDigitYearOperator, (y, 4, 6), (m, 0, 2),
                    (d, 2, 4))
            else:
                # test for yymmdd
                y2 = utils.getNumberFromText(text[0:2])
                m2 = utils.getNumberFromText(text[2:4])
                d2 = utils.getNumberFromText(text[4:6])
                if y2 is not None and _valid_month_day(m2, d2):
                    ref_StartSpan, ref_EndSpan = s.getSpan()
                    chrono_id = _append_condensed_date(
                        chrono_list, chrono_id, ref_StartSpan + token_start,
                        chrono.ChronoTwoDigitYearOperator, (y2, 0, 2),
                        (m2, 2, 4), (d2, 4, 6))

    return chrono_list, chrono_id, flags


####
# END_MODULE
####
def buildAMPM(s, chrono_id, chrono_list, flags):
    """Build an AM/PM entity plus the hour (and minute) written before it.

    When ``hasAMPM`` finds an AM/PM marker, an AMPM-of-day entity is added.
    If text precedes the marker and ``flags['hour']`` is not set, the first
    run of 1-4 digits in that text is read as a clock time:

    * 1-2 digits: the hour (kept when <= 12);
    * 3-4 digits: the last two digits are the minute (kept when < 60), the
      remaining leading digits the hour (kept when <= 12).

    Without digits the preceding text is passed to
    ``utils.getNumberFromText`` and used as the hour when that succeeds.

    Args:
        s: Temporal phrase exposing ``getText()`` and ``getSpan()``.
        chrono_id: Next free integer used to build entity ids.
        chrono_list: List that receives the new entities (mutated in place).
        flags: Dict of parser flags; ``flags['hour']`` is read, and
            ``flags['hour']`` / ``flags['minute']`` are set when the
            corresponding entity is created.

    Returns:
        tuple: ``(chrono_list, chrono_id)``.
    """
    ref_Sspan, ref_Espan = s.getSpan()

    boo, val, idxstart, idxend = hasAMPM(s)
    if not boo:
        return chrono_list, chrono_id

    my_AMPM_entity = chrono.ChronoAMPMOfDayEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_Sspan + idxstart,
        end_span=ref_Sspan + idxend,
        ampm_type=val)
    chrono_id = chrono_id + 1
    chrono_list.append(my_AMPM_entity)

    # Check to see if it has a time associated with it.  We assume the time
    # comes before the AMPM string.  idxstart is the first index of the ampm
    # marker; if any characters precede it, it is greater than 0.
    if idxstart <= 0 or flags['hour']:
        return chrono_list, chrono_id

    substr = s.getText()[0:idxstart]
    m = re.search(r'([0-9]{1,4})', substr)

    if m is None:
        # no digits: search for a text number
        texNumVal = utils.getNumberFromText(substr)
        if texNumVal is not None:
            my_hour_entity = chrono.ChronoHourOfDayEntity(
                entityID=str(chrono_id) + "entity",
                start_span=ref_Sspan,
                end_span=ref_Sspan + (idxstart - 1),
                value=texNumVal,
                ampm=my_AMPM_entity.get_id())
            chrono_id = chrono_id + 1
            chrono_list.append(my_hour_entity)
            flags["hour"] = True
        return chrono_list, chrono_id

    time_val = m.group(0)
    # Absolute position of the digit run; hour and minute offsets are taken
    # from here so they stay correct when text precedes the digits.
    digits_start = ref_Sspan + m.start(0)

    if len(time_val) <= 2:
        hour_txt, minute_txt = time_val, None
    else:
        # 3 digits -> h + mm, 4 digits -> hh + mm
        hour_txt, minute_txt = time_val[:-2], time_val[-2:]

    my_minute_entity = None
    if minute_txt is not None and int(minute_txt) < 60:
        minute_start = digits_start + len(hour_txt)
        my_minute_entity = chrono.ChronoMinuteOfHourEntity(
            entityID=str(chrono_id) + "entity",
            start_span=minute_start,
            end_span=minute_start + len(minute_txt),
            value=minute_txt)
        chrono_id = chrono_id + 1
        chrono_list.append(my_minute_entity)
        flags["minute"] = True

    if int(hour_txt) <= 12:
        hour_kwargs = dict(entityID=str(chrono_id) + "entity",
                           start_span=digits_start,
                           end_span=digits_start + len(hour_txt),
                           value=hour_txt,
                           ampm=my_AMPM_entity.get_id())
        # Link the minute only when one was actually built (an out-of-range
        # minute used to leave the name unbound), and link it by id like
        # every other entity reference in this module.
        if my_minute_entity is not None:
            hour_kwargs["sub_interval"] = my_minute_entity.get_id()
        my_hour_entity = chrono.ChronoHourOfDayEntity(**hour_kwargs)
        chrono_id = chrono_id + 1
        chrono_list.append(my_hour_entity)
        flags["hour"] = True

    return chrono_list, chrono_id
def buildAMPM(s, chrono_id, chrono_list, flags):
    """Build an AM/PM entity and, when present, the hour written before it.

    When ``hasAMPM`` reports a marker, an AMPM-of-day entity is appended.  If
    text precedes the marker and ``flags['hour']`` is not set, the first run
    of 1-2 digits in that text becomes the hour; without digits the text is
    passed to ``utils.getNumberFromText`` and used as the hour when that
    yields a value.  ``flags`` is only read, never modified.

    Args:
        s: Temporal phrase exposing ``getText()`` and ``getSpan()``.
        chrono_id: Next free integer used to build entity ids.
        chrono_list: List that receives the new entities (mutated in place).
        flags: Dict of parser flags; only ``flags['hour']`` is consulted.

    Returns:
        tuple: ``(chrono_list, chrono_id)``.
    """
    phrase_start, _phrase_end = s.getSpan()

    found, ampm_value, marker_start, marker_end = hasAMPM(s)
    if not found:
        return chrono_list, chrono_id

    ampm_entity = chrono.ChronoAMPMOfDayEntity(
        entityID=str(chrono_id) + "entity",
        start_span=phrase_start + marker_start,
        end_span=phrase_start + marker_end,
        ampm_type=ampm_value)
    chrono_id += 1
    chrono_list.append(ampm_entity)

    # The time is assumed to come before the AM/PM string; marker_start is
    # greater than 0 only if some characters precede the marker.
    if marker_start <= 0 or flags['hour']:
        return chrono_list, chrono_id

    leading_text = s.getText()[0:marker_start]
    digits = re.search('([0-9]{1,2})', leading_text)

    if digits is not None:
        digit_start, digit_end = digits.span(0)
        hour_entity = chrono.ChronoHourOfDayEntity(
            entityID=str(chrono_id) + "entity",
            start_span=phrase_start + digit_start,
            end_span=phrase_start + digit_end,
            value=digits.group(0),
            ampm=ampm_entity.get_id())
        chrono_id += 1
        chrono_list.append(hour_entity)
        return chrono_list, chrono_id

    # no digits: fall back to a number written as text
    spelled_hour = utils.getNumberFromText(leading_text)
    if spelled_hour is not None and not flags['hour']:
        hour_entity = chrono.ChronoHourOfDayEntity(
            entityID=str(chrono_id) + "entity",
            start_span=phrase_start,
            end_span=phrase_start + (marker_start - 1),
            value=spelled_hour,
            ampm=ampm_entity.get_id())
        chrono_id += 1
        chrono_list.append(hour_entity)

    return chrono_list, chrono_id
def buildTextMonthAndDay(s, chrono_id, chrono_list, flags, dct=None, ref_list=None):
    """Build month, day and year entities from a phrase with a text month.

    ``hasTextMonth`` locates the month. Text after the month is then parsed:
    if it is a single number it becomes a day (<= 31) or a year (1500-2050);
    otherwise punctuation is blanked out and every whitespace token is tested
    the same way. Text before the month is checked with ``hasModifier`` for a
    This/Next/Last operator. The month entity itself is appended last.

    Args:
        s: TimePhrase-like object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: Integer counter used to build ``"<n>entity"`` IDs.
        chrono_list: List that receives the new entities (mutated in place).
        flags: Dict of parse flags. ``month``, ``day``, ``fourdigityear`` and
            ``loneDigitYear`` are read; ``month``, ``day`` and
            ``fourdigityear`` may be set to True.
        dct: Document creation time. Currently unused: the doc-time based
            NEXT/LAST inference was hard-disabled (``if False``) and has been
            removed as unreachable code. Kept for interface compatibility.
        ref_list: Reference token list forwarded to ``hasTextMonth``.

    Returns:
        ``(chrono_list, chrono_id, flags)``.
    """
    boo, val, idxstart, idxend = hasTextMonth(s, ref_list)
    if not boo or flags["month"]:
        return chrono_list, chrono_id, flags

    flags["month"] = True
    ref_Sspan, _ = s.getSpan()
    phrase_text = s.getText()

    my_month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_Sspan + idxstart,
        end_span=ref_Sspan + idxend,
        month_type=val)
    chrono_id += 1

    # Heuristic used below: numbers 1-31 are days, numbers 1500-2050 are years.

    # idxend is the end offset of the month; anything after it may hold the
    # day and/or the year.
    if idxend < len(phrase_text):
        substr = phrase_text[idxend:].strip(",.").strip()
        num = utils.getNumberFromText(substr)

        if num is not None:
            # The whole trailing text is one number.
            if num <= 31 and not flags["day"]:
                flags["day"] = True
                day_start, day_end = calculateSpan(phrase_text, str(num))
                chrono_list.append(
                    chrono.ChronoDayOfMonthEntity(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan + day_start,
                        end_span=ref_Sspan + day_end,
                        value=num))
                chrono_id += 1
            elif (1500 <= num <= 2050 and not flags["fourdigityear"]
                  and not flags["loneDigitYear"]):
                flags["fourdigityear"] = True
                year_start, year_end = calculateSpan(phrase_text, substr)
                my_year_entity = chrono.ChronoYearEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + year_start,
                    end_span=ref_Sspan + year_end,
                    value=num)
                chrono_list.append(my_year_entity)
                my_year_entity.set_sub_interval(my_month_entity.get_id())
                chrono_id += 1
        else:
            # Mixed trailing text: blank out punctuation and test each token.
            cleaned = substr.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation)))
            for token in WhitespaceTokenizer().tokenize(cleaned):
                num = utils.getNumberFromText(token)
                if num is None:
                    continue
                if num <= 31:
                    # NOTE(review): unlike the single-number path above, this
                    # path neither checks nor sets flags["day"]; behavior is
                    # preserved as found.
                    day_start, day_end = calculateSpan(phrase_text, token)
                    chrono_list.append(
                        chrono.ChronoDayOfMonthEntity(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + day_start,
                            end_span=ref_Sspan + day_end,
                            value=num))
                    chrono_id += 1
                elif (1500 <= num <= 2050 and not flags["fourdigityear"]
                      and not flags["loneDigitYear"]):
                    flags["fourdigityear"] = True
                    year_start, year_end = calculateSpan(phrase_text, token)
                    my_year_entity = chrono.ChronoYearEntity(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan + year_start,
                        end_span=ref_Sspan + year_end,
                        value=num)
                    chrono_list.append(my_year_entity)
                    my_year_entity.set_sub_interval(my_month_entity.get_id())
                    chrono_id += 1

    # A month that does not start the phrase has leading text that may carry
    # a This/Next/Last modifier.
    if idxstart > 0:
        hasMod, mod_type, mod_start, mod_end = hasModifier(s)
        if hasMod:
            if mod_type == "This":
                chrono_list.append(
                    chrono.ChronoThisOperator(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan + mod_start,
                        end_span=ref_Sspan + mod_end,
                        repeating_interval=my_month_entity.get_id()))
                chrono_id += 1
            elif mod_type == "Next":
                chrono_list.append(
                    chrono.ChronoNextOperator(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan + mod_start,
                        end_span=ref_Sspan + mod_end,
                        repeating_interval=my_month_entity.get_id()))
                chrono_id += 1
            elif mod_type == "Last":
                chrono_list.append(
                    chrono.ChronoLastOperator(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan + mod_start,
                        end_span=ref_Sspan + mod_end,
                        repeating_interval=my_month_entity.get_id(),
                        semantics="Interval-Not-Included"))
                chrono_id += 1

    chrono_list.append(my_month_entity)
    return chrono_list, chrono_id, flags
def _classifyPeriodInterval(classifier, my_features):
    """Run the (model, kind) classifier pair on a feature dict; 1 means period."""
    model, kind = classifier[0], classifier[1]
    if kind == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if kind in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _periodIntervalModifier(s, ref_list, ref_idx, ref_Sspan, target,
                            link_kwarg, chrono_id, chrono_list):
    """Append a This/Next/Last operator linked to ``target`` if one applies.

    ``link_kwarg`` is ``"period"`` or ``"repeating_interval"`` and selects
    how the operator refers to ``target``. Returns the updated ``chrono_id``.
    """
    target_id = target.get_id()

    # Only look at the previous reference token when there is one. Without
    # this guard ``ref_idx - 1`` goes negative and Python indexing silently
    # wraps to the END of ref_list, testing an unrelated token.
    prior_is_this = False
    if ref_idx > 0:
        prior_tok = ref_list[ref_idx - 1].getText().lower()
        prior_is_this = prior_tok.translate(
            str.maketrans(string.punctuation,
                          ' ' * len(string.punctuation))) == "this"

    if prior_is_this:
        # The token is exactly "this", so the operator covers its first
        # len("this") characters.
        prior_start, _ = ref_list[ref_idx - 1].getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start,
            end_span=prior_start + len("this"))
        chrono_id += 1
        if link_kwarg == "period":
            chrono_this_entity.set_period(target_id)
        else:
            chrono_this_entity.set_repeating_interval(target_id)
        chrono_list.append(chrono_this_entity)
        return chrono_id

    # Otherwise look for a Next/Last word inside the phrase itself.
    hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
    if hasMod:
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **{link_kwarg: target_id}))
            chrono_id += 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **{link_kwarg: target_id}))
            chrono_id += 1
    return chrono_id


def _linkPrecedingNumber(s, idxstart, ref_Sspan, my_entity, chrono_id,
                         chrono_list):
    """Attach the number found in the word just before offset ``idxstart``.

    Must be called with ``idxstart > 0``. Returns the updated ``chrono_id``.
    """
    leading = s.getText()[0:idxstart]
    # Phrases such as "six-months" have no space before the interval word,
    # which shortens the spelled-number span by one character below.
    has_space = leading[-1] == ' '

    # Purposefully split on single spaces so offsets can be recomputed.
    words = leading.strip(' ').split(' ')
    prevtok = words[-1]
    if len(words) > 1:
        rest_of_phrase_length = len(' '.join(words[:-1])) + 1
    else:
        rest_of_phrase_length = 0
    tok_start = ref_Sspan + rest_of_phrase_length

    m = re.search('([0-9]{1,2})', prevtok)
    if m is not None:
        my_number_entity = chrono.ChronoNumber(
            entityID=str(chrono_id) + "entity",
            start_span=tok_start + m.span(0)[0],
            end_span=tok_start + m.span(0)[1],
            value=m.group(0))
    else:
        # No digits: fall back to a spelled-out number.
        texNumVal = utils.getNumberFromText(prevtok)
        if texNumVal is None:
            return chrono_id
        tok_end = tok_start + len(prevtok)
        if not has_space:
            tok_end -= 1
        my_number_entity = chrono.ChronoNumber(
            entityID=str(chrono_id) + "entity",
            start_span=tok_start,
            end_span=tok_end,
            value=texNumVal)

    chrono_id += 1
    chrono_list.append(my_number_entity)
    my_entity.set_number(my_number_entity.get_id())
    return chrono_id


def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build Period / Calendar-Interval entities (and helpers) for a phrase.

    Four cases, tried in order:

    1. ``hasPeriodInterval`` matched and the text contains a term that is
       always a calendar interval (yesterday, today, daily, /min, ...):
       a calendar interval is emitted, plus a Last operator for "yesterday".
    2. It matched with value ``"Unknown"``: a Period entity is emitted.
    3. It matched otherwise: the classifier decides between Period (class 1)
       and Calendar-Interval, a This/Next/Last operator is added when found,
       and a number immediately preceding the term is linked.
    4. It did not match: ``hasEmbeddedPeriodInterval`` is tried (for example
       a number glued to the term) and handled like case 3 without operators.

    Args:
        s: TimePhrase-like object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: Integer counter used to build ``"<n>entity"`` IDs.
        chrono_list: List that receives the new entities (mutated in place).
        ref_list: Reference tokens; each exposes ``getText()``/``getSpan()``.
        classifier: Pair ``(model, kind)``; kind "NN" uses
            ``ChronoKeras.keras_classify``, "SVM"/"RF" use ``model.predict``,
            anything else uses ``model.classify``.
        feats: Feature template dict; it is copied, never mutated here.

    Returns:
        ``(chrono_list, chrono_id)``.
    """
    features = feats.copy()
    ref_Sspan, _ = s.getSpan()
    boo, val, idxstart, idxend, _plural = hasPeriodInterval(s)

    if boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # Terms that are always marked as calendar intervals.
        if re.search(
                "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
                s.getText()):
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            chrono_id += 1

            if re.search("yesterday|yesterdays", s.getText()):
                # The Last operator points at the interval created just above.
                my_last_entity = chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    repeating_interval=str(chrono_id - 1) + "entity")
                chrono_id += 1
                chrono_list.append(my_last_entity)

            chrono_list.append(my_entity)

        # Terms that are always marked as periods.
        elif val == "Unknown":
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=val,
                number=None)
            chrono_id += 1
            chrono_list.append(my_entity)

        else:
            # Let the classifier decide between period and interval.
            ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, features)
            my_class = _classifyPeriodInterval(classifier, my_features)

            if my_class == 1:
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
                link_kwarg = "period"
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val,
                    number=None)
                link_kwarg = "repeating_interval"
            chrono_id += 1

            chrono_id = _periodIntervalModifier(
                s, ref_list, ref_idx, ref_Sspan, my_entity, link_kwarg,
                chrono_id, chrono_list)

            # A number is assumed to come right before the interval word.
            # Index arithmetic is used because phrases like "42-year-old"
            # are a single reference token.
            if idxstart > 0:
                chrono_id = _linkPrecedingNumber(
                    s, idxstart, ref_Sspan, my_entity, chrono_id, chrono_list)

            chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, _numstr = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            # Index of the first reference token overlapping the span, or -1.
            ref_idx = next(
                (i for i, ref in enumerate(ref_list)
                 if utils.overlap(ref.getSpan(), (abs_Sspan, abs_Espan))),
                -1)

            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, features)
            my_class = _classifyPeriodInterval(classifier, my_features)

            if my_class == 1:
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val)
            chrono_id += 1

            if idxstart > 0:
                chrono_id = _linkPrecedingNumber(
                    s, idxstart, ref_Sspan, my_entity, chrono_id, chrono_list)

            chrono_list.append(my_entity)

    return chrono_list, chrono_id
def hasEmbeddedPeriodInterval(tpentity): # convert to all lower # text_lower = tpentity.getText().lower() text = tpentity.getText() # remove all punctuation text_norm = text.translate( str.maketrans(string.punctuation, ' ' * len(string.punctuation))) # convert to list text_list = text_norm.split(" ") # define my period/interval term lists print( "TOFIX: PeriodInterval.py @ line 388: convert to using the dictionary." ) terms = [ "decades", "decade", "yesterday", "yesterdays", "today", "todays", "tomorrow", "tomorrows", "day", "week", "month", "year", "daily", "weekly", "monthly", "yearly", "century", "minute", "second", "hour", "hourly", "days", "weeks", "months", "years", "centuries", "century", "minutes", "seconds", "hours", "time", "shortly", "soon", "briefly", "awhile", "future", "lately", "annual", "hr", "hrs", "min", "mins", "quarter" ] #, "date"] ## if the term does not exist by itself it may be a substring. Go through each word in the TimePhrase string and see if a substring matches. for t in text_list: for r in terms: ## see if r is a substring of t ## if yes and the substring is at the end, extract the first substring and test to see if it is a number. idx = t.find(r) if (idx > 0): # then the r term is not the first substring. Extract and test. sub1 = t[:idx] sub2 = t[idx:] # sub1 should be a number if (isinstance(utils.getNumberFromText(sub1), (int))): # if it is a number then test to figure out what sub2 is. this_term = sub2 start_idx, end_idx = calculateSpan(text_norm, this_term) if this_term in [ "day", "daily", "days", "yesterday", "tomorrow", "yesterdays", "tomorrows", "today", "todays" ]: #print("ACK! 
Found an Embedded Day") return True, "Day", start_idx, end_idx, sub1 elif this_term in ["week", "weekly", "weeks"]: return True, "Week", start_idx, end_idx, sub1 elif this_term in ["month", "monthly", "months"]: return True, "Month", start_idx, end_idx, sub1 elif this_term in ["year", "yearly", "years"]: return True, "Year", start_idx, end_idx, sub1 elif this_term in ["century", "centuries"]: return True, "Century", start_idx, end_idx, sub1 elif this_term in ["decade", "decades"]: return True, "Decade", start_idx, end_idx, sub1 elif this_term in ["minute", "minutes"]: return True, "Minute", start_idx, end_idx, sub1 elif this_term in ["second", "seconds"]: return True, "Second", start_idx, end_idx, sub1 elif this_term in ["hour", "hourly", "hours"]: return True, "Hour", start_idx, end_idx, sub1 elif this_term in [ "time", "shortly", "soon", "briefly", "awhile", "future", "lately" ]: return True, "Unknown", start_idx, end_idx, sub1 else: return False, None, None, None, None return False, None, None, None, None
def buildSeasonOfYear(s, chrono_id, chrono_list, ref_list):
    """Build a season-of-year entity with optional operator and number.

    ``hasSeasonOfYear`` locates the season. If ``hasNextLastThis`` reports a
    modifier, the matching This/Next/Last operator is appended; it is given
    the season's own span. A 1-2 digit number (or a spelled-out number) in the
    text before the season is linked as the entity's number.

    Args:
        s: TimePhrase-like object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: Integer counter used to build ``"<n>entity"`` IDs.
        chrono_list: List that receives the new entities (mutated in place).
        ref_list: Reference token list forwarded to ``hasSeasonOfYear``.

    Returns:
        ``(chrono_list, chrono_id)``.
    """
    found, season, idxstart, idxend = hasSeasonOfYear(s, ref_list)
    if not found:
        return chrono_list, chrono_id

    phrase_start, phrase_end = s.getSpan()
    season_start = phrase_start + idxstart
    season_end = phrase_start + idxend
    season_entity = chrono.ChronoSeasonOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=season_start,
        end_span=season_end,
        season_type=season)
    chrono_id += 1

    # Optional This/Next/Last operator; no operator is added otherwise.
    has_mod, mod_type, mod_start, mod_end = hasNextLastThis(s)
    if has_mod:
        operator_for = {
            "This": chrono.ChronoThisOperator,
            "Next": chrono.ChronoNextOperator,
            "Last": chrono.ChronoLastOperator,
        }
        make_operator = operator_for.get(mod_type)
        if make_operator is not None:
            chrono_list.append(
                make_operator(
                    entityID=str(chrono_id) + "entity",
                    start_span=season_start,
                    end_span=season_end,
                    repeating_interval=season_entity.get_id()))
            chrono_id += 1

    # A number, if any, is assumed to come before the season word.
    if idxstart > 0:
        leading_text = s.getText()[0:idxstart]
        number_entity = None
        digits = re.search('([0-9]{1,2})', leading_text)
        if digits is not None:
            number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=phrase_start + digits.span(0)[0],
                end_span=phrase_start + digits.span(0)[1],
                value=digits.group(0))
        else:
            spelled = utils.getNumberFromText(leading_text)
            if spelled is not None:
                number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=phrase_start,
                    end_span=phrase_start + (idxstart - 1),
                    value=spelled)

        if number_entity is not None:
            chrono_id += 1
            chrono_list.append(number_entity)
            season_entity.set_number(number_entity.get_id())

    chrono_list.append(season_entity)
    return chrono_list, chrono_id
def _classifyDoseDuration(classifier, my_features):
    """Run the (model, kind) classifier pair on a feature dict; 1 means period."""
    model, kind = classifier[0], classifier[1]
    if kind == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if kind in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _doseDurationModifier(s, ref_list, ref_idx, ref_Sspan, target,
                          link_kwarg, chrono_id, chrono_list):
    """Append a This/Next/Last operator linked to ``target`` if one applies.

    ``link_kwarg`` is ``"period"`` or ``"repeating_interval"``. Returns the
    updated ``chrono_id``.
    """
    target_id = target.get_id()

    # Guard against ``ref_idx - 1`` going negative, which would silently
    # index from the END of ref_list and test an unrelated token.
    prior_is_this = False
    if ref_idx > 0:
        prior_tok = ref_list[ref_idx - 1].getText().lower()
        prior_is_this = prior_tok.translate(
            str.maketrans(string.punctuation,
                          ' ' * len(string.punctuation))) == "this"

    if prior_is_this:
        prior_start, _ = ref_list[ref_idx - 1].getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start,
            end_span=prior_start + len("this"))
        chrono_id += 1
        if link_kwarg == "period":
            chrono_this_entity.set_period(target_id)
        else:
            chrono_this_entity.set_repeating_interval(target_id)
        chrono_list.append(chrono_this_entity)
        return chrono_id

    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod:
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **{link_kwarg: target_id}))
            chrono_id += 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **{link_kwarg: target_id}))
            chrono_id += 1
    return chrono_id


def buildDoseDuration(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build a dose-duration entity for a phrase, if it looks like one.

    A series of textual filters first rejects phrases that are not dose
    durations (frequency words, "q<digit>" codes, slashes, de-identification
    brackets, "ly" forms, parentheses, "past"/"ago", ...). The phrase must
    also contain a numeric reference token. ``hasDoseDuration`` is then tried,
    falling back to ``hasEmbeddedPeriodInterval``; in both cases the
    classifier chooses how ``dose_type`` is derived, and a preceding number is
    linked to the entity.

    Args:
        s: TimePhrase-like object; ``getSpan()`` and ``getText()`` are used.
        chrono_id: Integer counter used to build ``"<n>entity"`` IDs.
        chrono_list: List that receives the new entities (mutated in place).
        ref_list: Reference tokens; ``getText()``, ``getSpan()`` and
            ``isNumeric()`` are used.
        classifier: Pair ``(model, kind)``; kind "NN" uses
            ``ChronoKeras.keras_classify``, "SVM"/"RF" use ``model.predict``,
            anything else uses ``model.classify``.
        feats: Feature template dict; it is copied, never mutated here.

    Returns:
        ``(chrono_list, chrono_id)``; both unchanged when the phrase is
        rejected.
    """
    features = feats.copy()
    ref_Sspan, _ = s.getSpan()
    bad = re.compile(r"^q\d|^Q\d")
    text = s.getText()
    lowered = text.lower()
    parts = text.split()
    unchanged = (chrono_list, chrono_id)

    # --- Filters: bail out unless this really looks like a dose duration ---
    if not parts:
        # Empty / whitespace-only phrase: nothing to inspect.
        return unchanged
    if isDoseDuration(parts[0]):
        return unchanged
    if any(word in lowered
           for word in ("every", "time", "per", "once", "twice", "past",
                        "ago")):
        return unchanged
    if bad.match(text):
        return unchanged
    if any(marker in text
           for marker in ("/", "[**", "**]", "ly", "(", ")", "RANDOM")):
        return unchanged

    # Every word that matches a reference token must be either numeric or a
    # dose-duration word, and at least one numeric token must be present.
    containsnum = False
    for part in parts:
        for ref in ref_list:
            if ref.getText().lower() == part.lower():
                if ref.isNumeric():
                    containsnum = True
                    break
                elif not tt.hasDoseDuration(ref.getText().lower()):
                    return unchanged
    if not containsnum:
        return unchanged

    boo, val, idxstart, idxend, _plural = hasDoseDuration(s)
    if boo:
        # The dose-duration span starts at the beginning of the phrase.
        abs_Sspan = ref_Sspan
        abs_Espan = ref_Sspan + idxend

        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, features)
        my_class = _classifyDoseDuration(classifier, my_features)

        if my_class == 1:
            dose_type = getDoseDurationValue(val)
            link_kwarg = "period"
        else:
            dose_type = val
            link_kwarg = "repeating_interval"

        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            dose_type=dose_type,
            number=None,
            text=text)
        chrono_id += 1

        chrono_id = _doseDurationModifier(
            s, ref_list, ref_idx, ref_Sspan, my_entity, link_kwarg,
            chrono_id, chrono_list)

        # A number is assumed to come before the duration word.
        # NOTE(review): the number entity is only used for its ID; it is
        # neither appended to chrono_list nor does it consume chrono_id
        # (both steps were commented out upstream). Behavior preserved.
        if idxstart > 0:
            substr = text[0:idxstart]
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=m.group(0))
                my_entity.set_number(my_number_entity.get_id())
            else:
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan,
                        end_span=ref_Sspan + (idxstart - 1),
                        value=texNumVal)
                    my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan
            abs_Espan = ref_Sspan + idxend

            # Index of the first reference token overlapping the span, or -1.
            ref_idx = next(
                (i for i, ref in enumerate(ref_list)
                 if utils.overlap(ref.getSpan(), (abs_Sspan, abs_Espan))),
                -1)

            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, features)
            # Same dispatch as the primary branch, so "SVM"/"RF" models are
            # driven through predict() here as well.
            my_class = _classifyDoseDuration(classifier, my_features)

            dose_type = getDoseDurationValue(val) if my_class == 1 else val
            my_entity = chrono.ChronoDoseDurationEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                dose_type=dose_type,
                number=None,
                text=text)
            chrono_id += 1

            # Look for the number in everything before the embedded term.
            # NOTE(review): as above, the number entity is not appended and
            # does not consume chrono_id.
            substr = text[:idxstart]
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=m.group(0))
                my_entity.set_number(my_number_entity.get_id())
            else:
                texNumVal = utils.getNumberFromText(numstr)
                if texNumVal is not None:
                    # numstr is used as the search pattern, as upstream.
                    m = re.search(numstr, substr)
                    if m is not None:
                        my_number_entity = chrono.ChronoNumber(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + m.span(0)[0],
                            end_span=ref_Sspan + m.span(0)[1],
                            value=texNumVal)
                        my_entity.set_number(my_number_entity.get_id())

            chrono_list.append(my_entity)

    return chrono_list, chrono_id