def buildThis(s, chrono_id, chrono_list):
    """Create This operators for the tokens "now", "today"/"todays" and "current".

    The phrase text is lowercased and every ``string.punctuation`` character is
    replaced by a single space; the result is split on spaces and each token is
    compared against the keywords above.

    * "now" / "current": a ``ChronoThisOperator`` is appended.
    * "today" / "todays": a ``ChronoThisOperator`` and a "Day"
      ``ChronoCalendarIntervalEntity`` are appended, with the operator's
      repeating interval set to the interval entity's id.  The span covers the
      five characters of "today" only.

    Spans are anchored at the position of the token that actually matched, so
    a keyword that also occurs earlier as part of another word (for example
    "known now") or more than once is annotated at the right place.

    :param s: phrase object exposing ``getText()`` and ``getSpan()``; the first
        element returned by ``getSpan()`` is used as the phrase start offset.
    :param chrono_id: integer counter; entity ids are ``str(chrono_id) + "entity"``
        and the counter is incremented once per entity created.
    :param chrono_list: list that new entities are appended to (mutated in place).
    :return: tuple ``(chrono_list, chrono_id)`` with the updated counter.
    """
    # Punctuation is swapped for spaces one-for-one, so offsets in the
    # normalized text line up with offsets in the lowercased phrase text.
    lowered = s.getText().lower()
    blanked = lowered.translate(
        str.maketrans(string.punctuation, " " * len(string.punctuation)))
    trimmed = blanked.strip()
    # strip() can drop leading whitespace; keep its width so that offsets
    # measured in `trimmed` can be mapped back onto the phrase.
    lead = len(blanked) - len(blanked.lstrip())

    # Tokens are maximal runs of non-space characters, i.e. exactly the
    # non-empty pieces that splitting on a single space would produce.
    for tok_match in re.finditer("[^ ]+", trimmed):
        tok = tok_match.group(0)
        if tok in ("now", "current"):
            keyword, is_today = tok, False
        elif tok in ("today", "todays"):
            keyword, is_today = "today", True
        else:
            continue

        ref_startSpan, _ = s.getSpan()
        start = ref_startSpan + lead + tok_match.start()
        end = start + len(keyword)

        ## create a This entity
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=start,
            end_span=end)
        chrono_id = chrono_id + 1

        if not is_today:
            chrono_list.append(chrono_this_entity)
            continue

        ## "today" also gets a Day interval that the This operator points at
        chrono_interval_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=start,
            end_span=end,
            calendar_type="Day",
            number=None)
        chrono_id = chrono_id + 1
        chrono_this_entity.set_repeating_interval(
            chrono_interval_entity.get_id())
        chrono_list.append(chrono_this_entity)
        chrono_list.append(chrono_interval_entity)

    ## Note, may need to look for phrases like "current week" at some point.
    return chrono_list, chrono_id
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build Period / Calendar-Interval entities for one phrase.

    Four cases are handled, in this order:

    1. ``hasPeriodInterval(s)`` succeeds and the phrase text matches one of the
       fixed terms (yesterday, tomorrow, today, daily, /min, /week): a
       ``ChronoCalendarIntervalEntity`` is created; for "yesterday" a
       ``ChronoLastOperator`` pointing at it is appended first.
    2. ``hasPeriodInterval(s)`` succeeds with value ``"Unknown"``: a
       ``ChronoPeriodEntity`` is created.
    3. ``hasPeriodInterval(s)`` succeeds otherwise: the classifier decides
       between period (class 1) and calendar interval (anything else).  A
       This operator is added when the preceding reference token is exactly
       "this"; otherwise ``hasModifier(s)`` is consulted for a Next/Last
       operator.  A number found before the term is attached as a
       ``ChronoNumber``.
    4. ``hasPeriodInterval(s)`` fails: ``hasEmbeddedPeriodInterval(s)`` is
       tried, the term is classified the same way and a preceding number is
       attached when one is found.

    NOTE(review): SOURCE contains a second, later definition of
    ``buildPeriodInterval``; if both live in the same module the later one
    replaces this one at import time -- confirm which is intended.

    :param s: phrase object exposing ``getText()`` and ``getSpan()``.
    :param chrono_id: integer counter; entity ids are ``str(chrono_id) + "entity"``
        and the counter is incremented once per entity created.
    :param chrono_list: list that new entities are appended to (mutated in place).
    :param ref_list: sequence of reference tokens exposing ``getText()`` and
        ``getSpan()``.
    :param classifier: indexable pair; ``classifier[0]`` is the model and
        ``classifier[1]`` its kind ("NN", "SVM", "RF", or anything else for a
        model exposing ``classify``).
    :param feats: feature mapping; a copy is handed to
        ``utils.extract_prediction_features``.
    :return: tuple ``(chrono_list, chrono_id)`` with the updated counter.
    """
    ref_Sspan, _ = s.getSpan()
    phrase_text = s.getText()

    def classify(feature_map):
        """Run the configured model on one feature mapping and return its label."""
        model, model_kind = classifier[0], classifier[1]
        if model_kind == "NN":
            return ChronoKeras.keras_classify(
                model, np.array(list(feature_map.values())))
        if model_kind in ("SVM", "RF"):
            return model.predict([[int(v) for v in feature_map.values()]])[0]
        return model.classify(feature_map)

    boo, val, idxstart, idxend, _ = hasPeriodInterval(s)

    if not boo:
        ## Case 4: look for a period/interval term embedded in a larger token.
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if not boo2:
            return chrono_list, chrono_id

        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # index of the first reference token overlapping the term, -1 if none
        ref_idx = next(
            (i for i, ref in enumerate(ref_list)
             if utils.overlap(ref.getSpan(), (abs_Sspan, abs_Espan))), -1)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # if 1 then it is a period, otherwise it is an interval
        if classify(my_features) == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val)
        chrono_id = chrono_id + 1

        ## Look for a number in the part of the phrase before the term:
        ## digits first, then the text number reported with the term.
        substr = phrase_text[:idxstart]
        num_match = re.search('([0-9]{1,2})', substr)
        if num_match is not None:
            num_val = num_match.group(0)
        else:
            num_val = utils.getNumberFromText(numstr)
            if num_val is not None:
                # numstr is literal text, so escape it before searching
                num_match = re.search(re.escape(numstr), substr)

        if num_match is not None:
            my_number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=ref_Sspan + num_match.start(),
                end_span=ref_Sspan + num_match.end(),
                value=num_val)
            chrono_id = chrono_id + 1
            chrono_list.append(my_number_entity)
            my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    ## Case 1: terms that are always marked as calendar intervals
    if re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            phrase_text):
        interval_id = str(chrono_id) + "entity"
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=interval_id,
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", phrase_text):
            # the Last operator is appended before the interval it refers to
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    repeating_interval=interval_id))
            chrono_id = chrono_id + 1

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    ## Case 2: terms that are always marked as periods
    if val == "Unknown":
        chrono_list.append(
            chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=val,
                number=None))
        chrono_id = chrono_id + 1
        return chrono_list, chrono_id

    ## Case 3: let the classifier decide between period and interval
    ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
    my_features = utils.extract_prediction_features(
        ref_list, ref_idx, feats.copy())

    # if 1 then it is a period, otherwise it is an interval
    is_period = classify(my_features) == 1
    if is_period:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
        link_kwarg = "period"
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        link_kwarg = "repeating_interval"
    chrono_id = chrono_id + 1

    ### Check to see if this term has a "this" in front of it.
    # Only look back when a previous token exists: with ref_idx <= 0 the
    # index ref_idx - 1 would wrap around to the end of ref_list.
    prior_ref = ref_list[ref_idx - 1] if ref_idx > 0 else None
    # Exact match on purpose: a token such as "this," does not count.
    if prior_ref is not None and prior_ref.getText().lower() == "this":
        prior_start, _ = prior_ref.getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start,
            end_span=prior_start + len("this"))
        chrono_id = chrono_id + 1
        if is_period:
            chrono_this_entity.set_period(my_entity.get_id())
        else:
            chrono_this_entity.set_repeating_interval(my_entity.get_id())
        chrono_list.append(chrono_this_entity)
    else:
        # check for a Next / Last word inside the phrase
        hasMod, mod_type, mod_start, mod_end = hasModifier(s)
        if hasMod and mod_type in ("Next", "Last"):
            link = {link_kwarg: my_entity.get_id()}
            if mod_type == "Next":
                modifier_entity = chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **link)
            else:
                modifier_entity = chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **link)
            chrono_list.append(modifier_entity)
            chrono_id = chrono_id + 1

    # Check to see if it has a number associated with it.
    # We assume the number comes before the interval string.
    if idxstart > 0:
        substr = phrase_text[0:idxstart]
        number = None  # (start_span, end_span, value) once a number is found
        digits = re.search('([0-9]{1,2})', substr)
        if digits is not None:
            number = (ref_Sspan + digits.start(),
                      ref_Sspan + digits.end(),
                      digits.group(0))
        else:
            # else search for a text number; its span is taken to be the
            # whole prefix up to one character before the term
            texNumVal = utils.getNumberFromText(substr)
            if texNumVal is not None:
                number = (ref_Sspan, ref_Sspan + (idxstart - 1), texNumVal)

        if number is not None:
            my_number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=number[0],
                end_span=number[1],
                value=number[2])
            chrono_id = chrono_id + 1
            chrono_list.append(my_number_entity)
            my_entity.set_number(my_number_entity.get_id())

    chrono_list.append(my_entity)
    return chrono_list, chrono_id
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build Period / Calendar-Interval entities for one phrase.

    Four cases are handled, in this order:

    1. ``hasPeriodInterval(s)`` succeeds and the phrase text matches one of the
       fixed terms (yesterday, tomorrow, today, daily, /min, /week): a
       ``ChronoCalendarIntervalEntity`` is created; for "yesterday" a
       ``ChronoLastOperator`` pointing at it is appended first.
    2. ``hasPeriodInterval(s)`` succeeds with value ``"Unknown"``: a
       ``ChronoPeriodEntity`` is created.
    3. ``hasPeriodInterval(s)`` succeeds otherwise: the classifier decides
       between period (class 1) and calendar interval (anything else).  A
       This operator is added when the preceding reference token is exactly
       "this"; otherwise ``hasNextLastThis(s)`` is consulted for a Next/Last
       operator.
    4. ``hasPeriodInterval(s)`` fails: ``hasEmbeddedPeriodInterval(s)`` is
       tried and the term is classified the same way.

    In cases 3 and 4 a number is looked for in the single token directly in
    front of the term (digits first, then a text number) and attached as a
    ``ChronoNumber``.  Only that token is inspected because phrases such as
    "42-year-old" are one reference token, so the position is worked out with
    index arithmetic on the phrase text.

    NOTE(review): SOURCE contains an earlier definition of
    ``buildPeriodInterval`` as well; if both live in the same module this later
    one is the one in effect -- confirm the earlier one can be retired.

    :param s: phrase object exposing ``getText()`` and ``getSpan()``.
    :param chrono_id: integer counter; entity ids are ``str(chrono_id) + "entity"``
        and the counter is incremented once per entity created.
    :param chrono_list: list that new entities are appended to (mutated in place).
    :param ref_list: sequence of reference tokens exposing ``getText()`` and
        ``getSpan()``.
    :param classifier: indexable pair; ``classifier[0]`` is the model and
        ``classifier[1]`` its kind ("NN", "SVM", "RF", or anything else for a
        model exposing ``classify``).
    :param feats: feature mapping; a copy is handed to
        ``utils.extract_prediction_features``.
    :return: tuple ``(chrono_list, chrono_id)`` with the updated counter.
    """
    ref_Sspan, _ = s.getSpan()
    phrase_text = s.getText()

    def classify(feature_map):
        """Run the configured model on one feature mapping and return its label."""
        model, model_kind = classifier[0], classifier[1]
        if model_kind == "NN":
            return ChronoKeras.keras_classify(
                model, np.array(list(feature_map.values())))
        if model_kind in ("SVM", "RF"):
            return model.predict([[int(v) for v in feature_map.values()]])[0]
        return model.classify(feature_map)

    def preceding_number(term_start):
        """Find a number in the token right before ``term_start``.

        Returns ``(start_span, end_span, value)`` in absolute coordinates, or
        ``None`` when the previous token holds neither digits nor a text number.
        """
        before = phrase_text[0:term_start]
        # A missing space means the token is glued to the term by one
        # separator character (e.g. "six-months"), which is cut off the span.
        has_space = before.endswith(' ')
        trimmed = before.rstrip(' ')
        # Offsets are measured in the untrimmed text so that leading spaces
        # in the phrase do not shift the span.
        tok_start = trimmed.rfind(' ') + 1
        prevtok = trimmed[tok_start:]
        base = ref_Sspan + tok_start

        digits = re.search('([0-9]{1,2})', prevtok)
        if digits is not None:
            return base + digits.start(), base + digits.end(), digits.group(0)

        # else search for a text number
        texNumVal = utils.getNumberFromText(prevtok)
        if texNumVal is None:
            return None
        end = base + len(prevtok)
        if not has_space:
            end = end - 1
        return base, end, texNumVal

    boo, val, idxstart, idxend, _ = hasPeriodInterval(s)

    if not boo:
        ## Case 4: look for a period/interval term embedded in a larger token.
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if not boo2:
            return chrono_list, chrono_id

        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # index of the first reference token overlapping the term, -1 if none
        ref_idx = next(
            (i for i, ref in enumerate(ref_list)
             if utils.overlap(ref.getSpan(), (abs_Sspan, abs_Espan))), -1)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # if 1 then it is a period, otherwise it is an interval
        if classify(my_features) == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val)
        chrono_id = chrono_id + 1

        # Extract the number that precedes the term, if any
        if idxstart > 0:
            number = preceding_number(idxstart)
            if number is not None:
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=number[0],
                    end_span=number[1],
                    value=number[2])
                chrono_id = chrono_id + 1
                chrono_list.append(my_number_entity)
                my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    ## Case 1: terms that are always marked as calendar intervals
    if re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            phrase_text):
        interval_id = str(chrono_id) + "entity"
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=interval_id,
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", phrase_text):
            # the Last operator is appended before the interval it refers to
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    repeating_interval=interval_id))
            chrono_id = chrono_id + 1

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    ## Case 2: terms that are always marked as periods
    if val == "Unknown":
        chrono_list.append(
            chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=val,
                number=None))
        chrono_id = chrono_id + 1
        return chrono_list, chrono_id

    ## Case 3: let the classifier decide between period and interval
    ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
    my_features = utils.extract_prediction_features(
        ref_list, ref_idx, feats.copy())

    # if 1 then it is a period, otherwise it is an interval
    is_period = classify(my_features) == 1
    if is_period:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
        link_kwarg = "period"
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        link_kwarg = "repeating_interval"
    chrono_id = chrono_id + 1

    ### Check to see if this term has a "this" in front of it.
    # Only look back when a previous token exists: with ref_idx <= 0 the
    # index ref_idx - 1 would wrap around to the end of ref_list.
    prior_ref = ref_list[ref_idx - 1] if ref_idx > 0 else None
    # Exact match on purpose: a token such as "this," does not count.
    if prior_ref is not None and prior_ref.getText().lower() == "this":
        prior_start, _ = prior_ref.getSpan()
        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start,
            end_span=prior_start + len("this"))
        chrono_id = chrono_id + 1
        if is_period:
            chrono_this_entity.set_period(my_entity.get_id())
        else:
            chrono_this_entity.set_repeating_interval(my_entity.get_id())
        chrono_list.append(chrono_this_entity)
    else:
        # check for a Next / Last word inside the phrase
        hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
        if hasMod and mod_type in ("Next", "Last"):
            link = {link_kwarg: my_entity.get_id()}
            if mod_type == "Next":
                modifier_entity = chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **link)
            else:
                modifier_entity = chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **link)
            chrono_list.append(modifier_entity)
            chrono_id = chrono_id + 1

    # Check to see if it has a number associated with it.
    # We assume the number comes directly before the interval string.
    if idxstart > 0:
        number = preceding_number(idxstart)
        if number is not None:
            my_number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=number[0],
                end_span=number[1],
                value=number[2])
            chrono_id = chrono_id + 1
            chrono_list.append(my_number_entity)
            my_entity.set_number(my_number_entity.get_id())

    chrono_list.append(my_entity)
    return chrono_list, chrono_id