def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build Period / Calendar-Interval entities for one temporal phrase.

    The phrase is first checked with ``hasPeriodInterval``; if that finds
    nothing, ``hasEmbeddedPeriodInterval`` is tried.  Every entity that gets
    built (the period/interval itself, an optional This/Next/Last operator and
    an optional Number) is appended to ``chrono_list``.

    NOTE(review): another ``buildPeriodInterval`` is defined later in this
    file; if both live in the same module the later definition shadows this
    one.

    Args:
        s: temporal phrase object; ``getText()`` and ``getSpan()`` are used.
        chrono_id: next free integer used to build entity IDs
            (``"<id>entity"``).
        chrono_list: list the new entities are appended to (mutated in place).
        ref_list: reference tokens; ``getText()`` and ``getSpan()`` are used.
        classifier: indexable pair where ``classifier[0]`` is the model and
            ``classifier[1]`` names its kind ("NN", "SVM", "RF"; any other
            value is treated as a model exposing ``classify``).
        feats: feature dictionary; a copy is handed to the feature extractor.

    Returns:
        Tuple ``(chrono_list, chrono_id)`` with the updated ID counter.
    """
    ref_Sspan, _ = s.getSpan()
    boo, val, idxstart, idxend, _ = hasPeriodInterval(s)
    phrase_text = s.getText()
    punct_to_space = str.maketrans(string.punctuation,
                                   ' ' * len(string.punctuation))

    # FIND terms that are always marked as calendar intervals!
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            phrase_text):
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", phrase_text):
            # "yesterday" additionally gets a Last operator that points back
            # at the interval created just above (hence chrono_id - 1).
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)

    # FIND terms that are always marked as periods!
    elif boo and val == "Unknown":
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=val,
            number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    elif boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # get index of overlapping reference token
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)

        # extract ML features
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # classify into period or interval
        if classifier[1] == "NN":
            my_class = ChronoKeras.keras_classify(
                classifier[0], np.array(list(my_features.values())))
        elif classifier[1] in ("SVM", "RF"):
            feat_array = [int(i) for i in my_features.values()]
            my_class = classifier[0].predict([feat_array])[0]
        else:
            my_class = classifier[0].classify(my_features)

        # if 1 then it is a period, otherwise it is an interval.  The two
        # cases differ only in the entity type and in the keyword / setter
        # used to link operators to it.
        is_period = my_class == 1
        if is_period:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
            link_kw = "period"
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            link_kw = "repeating_interval"
        chrono_id = chrono_id + 1

        # Check to see if the entity has a "this" in front of it.  Only look
        # at the previous token when there is one: with ref_idx <= 0 the
        # expression ref_list[ref_idx - 1] would wrap around to the end of
        # the token list.
        prior_tok = ""
        if ref_idx > 0:
            prior_tok = ref_list[ref_idx - 1].getText().lower()

        if prior_tok.translate(punct_to_space) == "this":
            # add a This entity and link it to the period/interval.
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, _ = ref_list[ref_idx - 1].getSpan()

            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id = chrono_id + 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
        else:
            # check for a Next/Last word
            hasMod, mod_type, mod_start, mod_end = hasModifier(s)
            if hasMod:
                if mod_type == "Next":
                    chrono_list.append(
                        chrono.ChronoNextOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            **{link_kw: my_entity.get_id()}))
                    chrono_id = chrono_id + 1

                if mod_type == "Last":
                    chrono_list.append(
                        chrono.ChronoLastOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            semantics="Interval-Not-Included",
                            **{link_kw: my_entity.get_id()}))
                    chrono_id = chrono_id + 1

        # check to see if it has a number associated with it.  We assume the
        # number comes before the interval string
        if idxstart > 0:
            substr = phrase_text[0:idxstart]
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                num_val = m.group(0)
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=num_val)
                chrono_id = chrono_id + 1

                # add the number entity to the list
                chrono_list.append(my_number_entity)
                my_entity.set_number(my_number_entity.get_id())
            # else search for a text number
            else:
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    # create the number entity; it spans everything before
                    # the interval word
                    my_number_entity = chrono.ChronoNumber(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan,
                        end_span=ref_Sspan + (idxstart - 1),
                        value=texNumVal)
                    chrono_id = chrono_id + 1
                    # append to list
                    chrono_list.append(my_number_entity)
                    # link to interval entity
                    my_entity.set_number(my_number_entity.get_id())

        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            # get index of the first overlapping reference token (-1 if none)
            ref_idx = -1
            for i, ref_tok in enumerate(ref_list):
                if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
                    ref_idx = i
                    break

            # extract ML features
            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, feats.copy())

            # classify into period or interval; same dispatch as the
            # non-embedded branch so SVM/RF models go through predict()
            if classifier[1] == "NN":
                my_class = ChronoKeras.keras_classify(
                    classifier[0], np.array(list(my_features.values())))
            elif classifier[1] in ("SVM", "RF"):
                feat_array = [int(i) for i in my_features.values()]
                my_class = classifier[0].predict([feat_array])[0]
            else:
                my_class = classifier[0].classify(my_features)

            # if 1 then it is a period, otherwise it is an interval
            if my_class == 1:
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val)
            chrono_id = chrono_id + 1

            # Extract the number and identify the span of numstr
            substr = phrase_text[:idxstart]  # everything before the interval
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                num_val = m.group(0)
                my_number_entity = chrono.ChronoNumber(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + m.span(0)[0],
                    end_span=ref_Sspan + m.span(0)[1],
                    value=num_val)
                chrono_id = chrono_id + 1

                # add the number entity to the list
                chrono_list.append(my_number_entity)
                # link to interval entity
                my_entity.set_number(my_number_entity.get_id())
            # else search for a text number
            else:
                texNumVal = utils.getNumberFromText(numstr)
                if texNumVal is not None:
                    # locate the number string literally in the sub-phrase;
                    # escape it so regex metacharacters are not interpreted
                    m = re.search(re.escape(numstr), substr)
                    if m is not None:
                        # create the number entity
                        my_number_entity = chrono.ChronoNumber(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + m.span(0)[0],
                            end_span=ref_Sspan + m.span(0)[1],
                            value=texNumVal)
                        chrono_id = chrono_id + 1
                        # append to list
                        chrono_list.append(my_number_entity)
                        # link to interval entity
                        my_entity.set_number(my_number_entity.get_id())

            chrono_list.append(my_entity)

    return chrono_list, chrono_id
def createMLTrainingMatrix(infiles, gold_folder, ext="", save=False,
                           output="aquaint_train", window=3):
    """Build the period/interval training matrix from gold-standard files.

    Algorithm, for each input file:
      1) parse the text into reference tokens and mark temporal/numeric ones
      2) import the gold standard ``period-interval.gold.csv`` for that file
      3) for every token that overlaps a gold entry, build a feature
         dictionary and record its class (1 for ``Period``, 0 otherwise)
    Finally the observations are written to ``<output>_data.csv`` and the
    classes to ``<output>_class.csv``.

    NOTE(review): a missing gold file stops processing of all remaining
    input files (``break``); confirm this is intended rather than skipping
    only that file.

    Args:
        infiles: list of input file paths without extension; ``".dct"`` and
            ``ext`` are appended to each when reading.
        gold_folder: folder containing one sub-folder per input file name,
            each holding ``period-interval.gold.csv``.
        ext: extension appended to each input path to read the text.
        save: when true, a trace is written to
            ``./gold-standard-parsing.txt``.
        output: prefix of the two CSV files that are written.
        window: window size passed to ``extract_bow_features``.

    Returns:
        Tuple ``(full_obs_list, category)``: the list of complete feature
        dictionaries and the parallel list of class labels.
    """
    obs_list = []   # the feature dictionary of each observation
    category = []   # class of each observation: 1 for period, 0 otherwise
    # Full feature set; passed to the extract_* helpers and its keys are
    # used below to give every observation the same columns.
    features = {
        'feat_numeric': 0,
        'feat_temp_context': 0,
        'feat_temp_self': 0
    }

    outfile = open("./gold-standard-parsing.txt", 'w') if save else None
    try:
        ## Loop through each file and parse
        for infile in infiles:
            print("ML Parsing " + infile + " ...")

            ## parse out the doctime
            doctime = utils.getDocTime(infile + ".dct")
            if debug:
                print(doctime)

            ## parse out reference tokens
            _text, tokens, spans, tags = utils.getWhitespaceTokens(infile + ext)
            my_refToks = referenceToken.convertToRefTokens(
                tok_list=tokens, span=spans, pos=tags)

            ## mark all ref tokens if they are numeric or temporal
            chroList = utils.markTemporal(my_refToks)

            ## import gold standard data
            gold_file = os.path.join(gold_folder,
                                     os.path.split(infile)[1],
                                     "period-interval.gold.csv")
            if not os.path.exists(gold_file):
                print(gold_file + " DOES NOT EXISTS")
                break

            if save:
                outfile.write("\n$$$$$$$$$$$\nProcessing: " + gold_file)

            gold_list = []
            # newline='' is what the csv module asks for on files it reads
            with open(gold_file, newline='') as gold_csv:
                for row in csv.DictReader(gold_csv):
                    gold_list.append({
                        'type': row['type'],
                        'start': row['start'],
                        'end': row['end'],
                        'value': row['value']
                    })
                    if save:
                        outfile.write("\n" + str(row))

            # Parse the gold spans once instead of once per token.
            gold_spans = [(g['type'], int(g['start']), int(g['end']))
                          for g in gold_list]

            ## loop through each reftoken and see if it overlaps a gold token
            last_idx = len(chroList) - 1
            for r, reftok in enumerate(chroList):
                ref_s, ref_e = reftok.getSpan()

                for gold_type, gold_s, gold_e in gold_spans:
                    if not utils.overlap([ref_s, ref_e], [gold_s, gold_e]):
                        continue

                    # the gold token overlaps the current reftok, so
                    # extract its features and add them to the list
                    this_obs = {}
                    if save:
                        outfile.write("\nPrevious Token: " +
                                      str(chroList[max(r - 1, 0)]))
                        outfile.write("\nTarget Token: " + str(reftok))
                        # clamp against the list that is actually indexed
                        outfile.write("\nNext Token: " +
                                      str(chroList[min(r + 1, last_idx)]) +
                                      "\n")

                    ### Identify Temporal features
                    this_obs = extract_temp_features(chroList, r, 3, this_obs)

                    ### Extract all words within a N-word window
                    this_obs, _ = extract_bow_features(
                        chroList, r, window, features, this_obs)

                    ### Is there a numeric before or after the target word?
                    this_obs = extract_numeric_feature(chroList, r, this_obs)

                    ### Stem and extract the actual word
                    this_obs, _ = extract_stem_feature(
                        chroList[r], features, this_obs)

                    ### Get the correct type
                    category.append(1 if gold_type == 'Period' else 0)
                    obs_list.append(this_obs)
    finally:
        # the trace file used to be left open; always release it
        if outfile is not None:
            outfile.close()

    ## All features collected; now put them together in a matrix.
    print("features length: " + str(len(features.keys())))
    print("obs_list length: " + str(len(obs_list)))
    print("category length: " + str(len(category)))

    ## Expand every observation to the full feature set.
    full_obs_list = []
    for obs in obs_list:
        feats = deepcopy(features)
        feats.update(obs)
        full_obs_list.append(feats)

    ## Write the matrix and the classes to file, then return them.
    # With no observations there is no first row to take the header from,
    # so fall back to the feature template's keys.
    keys = full_obs_list[0].keys() if full_obs_list else features.keys()
    with open(output + '_data.csv', 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(full_obs_list)

    with open(output + '_class.csv', 'w') as output_file:
        for c in category:
            output_file.write("%s\n" % c)

    ### Now return the feature list and the categories
    return (full_obs_list, category)
def buildDoseDuration(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build a Dose-Duration entity for one temporal phrase, if it is one.

    A series of filters first rejects phrases that should not be treated as
    dose durations (frequency words, slashes, de-identification brackets,
    ordinals, phrases without a numeric token, ...).  Phrases that pass are
    checked with ``hasDoseDuration`` and, failing that,
    ``hasEmbeddedPeriodInterval``.  Entities that get built are appended to
    ``chrono_list``.

    Args:
        s: temporal phrase object; ``getText()`` and ``getSpan()`` are used.
        chrono_id: next free integer used to build entity IDs
            (``"<id>entity"``).
        chrono_list: list the new entities are appended to (mutated in place).
        ref_list: reference tokens; ``getText()``, ``getSpan()`` and
            ``isNumeric()`` are used.
        classifier: indexable pair where ``classifier[0]`` is the model and
            ``classifier[1]`` names its kind ("NN", "SVM", "RF"; any other
            value is treated as a model exposing ``classify``).
        feats: feature dictionary; a copy is handed to the feature extractor.

    Returns:
        Tuple ``(chrono_list, chrono_id)``; both are returned unchanged when
        the phrase is rejected.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    phrase_text = s.getText()
    phrase_lower = phrase_text.lower()
    parts = phrase_text.split()

    # various checks to ensure that this phrase is actually a dose duration
    if not parts:
        # empty / whitespace-only phrase: nothing to inspect (parts[0] would
        # raise IndexError)
        return chrono_list, chrono_id
    if isDoseDuration(parts[0]):
        return chrono_list, chrono_id
    if any(word in phrase_lower
           for word in ("every", "time", "per", "once", "twice", "past",
                        "ago")):
        return chrono_list, chrono_id
    if re.match(r"^q\d|^Q\d", phrase_text):
        return chrono_list, chrono_id
    if any(marker in phrase_text
           for marker in ("/", "[**", "**]", "ly", "(", ")", "RANDOM")):
        return chrono_list, chrono_id

    # Every word of the phrase that matches a reference token must be either
    # numeric (but not an ordinal) or a dose-duration term, and at least one
    # numeric token must be present.
    containsnum = False
    # escape the punctuation so the class matches every punctuation char
    # literally (unescaped, "\]" swallowed the backslash)
    punct_re = re.compile('[' + re.escape(string.punctuation) + ']')
    for part in parts:
        part_lower = punct_re.sub('', part).strip().lower()
        for ref in ref_list:
            if ref.getText().lower() != part_lower:
                continue
            if ref.isNumeric():
                containsnum = True
                if utils.isOrdinal(ref.getText()):
                    return chrono_list, chrono_id
                break
            elif not tt.hasDoseDuration(ref.getText().lower()):
                return chrono_list, chrono_id
    if not containsnum:
        return chrono_list, chrono_id

    punct_to_space = str.maketrans(string.punctuation,
                                   ' ' * len(string.punctuation))

    boo, val, idxstart, idxend, _ = hasDoseDuration(s)
    if boo:
        # the entity covers the whole phrase
        abs_Sspan = ref_Sspan
        abs_Espan = ref_Espan

        # get index of overlapping reference token
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)

        # extract ML features
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # classify into period or interval
        if classifier[1] == "NN":
            my_class = ChronoKeras.keras_classify(
                classifier[0], np.array(list(my_features.values())))
        elif classifier[1] in ("SVM", "RF"):
            feat_array = [int(i) for i in my_features.values()]
            my_class = classifier[0].predict([feat_array])[0]
        else:
            my_class = classifier[0].classify(my_features)

        # if 1 then it is a period, otherwise it is an interval.  The two
        # cases differ only in how the dose type is derived and in the
        # keyword / setter used to link operators to the entity.
        is_period = my_class == 1
        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            dose_type=getDoseDurationValue(val) if is_period else val,
            number=None,
            text=phrase_text)
        chrono_id = chrono_id + 1
        link_kw = "period" if is_period else "repeating_interval"

        # Check to see if the entity has a "this" in front of it.  Only look
        # at the previous token when there is one: with ref_idx <= 0 the
        # expression ref_list[ref_idx - 1] would wrap around to the end of
        # the token list.
        prior_tok = ""
        if ref_idx > 0:
            prior_tok = ref_list[ref_idx - 1].getText().lower()

        if prior_tok.translate(punct_to_space) == "this":
            # add a This entity and link it to the dose duration.
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, _ = ref_list[ref_idx - 1].getSpan()

            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id = chrono_id + 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
        else:
            # check for a Next/Last word
            hasMod, mod_type, mod_start, mod_end = hasModifier(s)
            if hasMod:
                if mod_type == "Next":
                    chrono_list.append(
                        chrono.ChronoNextOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            **{link_kw: my_entity.get_id()}))
                    chrono_id = chrono_id + 1

                if mod_type == "Last":
                    chrono_list.append(
                        chrono.ChronoLastOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            semantics="Interval-Not-Included",
                            **{link_kw: my_entity.get_id()}))
                    chrono_id = chrono_id + 1

        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, _ = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan
            abs_Espan = ref_Espan

            # get index of the first overlapping reference token (-1 if none)
            ref_idx = -1
            for i, ref_tok in enumerate(ref_list):
                if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
                    ref_idx = i
                    break

            # extract ML features
            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, feats.copy())

            # classify into period or interval; same dispatch as the
            # non-embedded branch so SVM/RF models go through predict()
            if classifier[1] == "NN":
                my_class = ChronoKeras.keras_classify(
                    classifier[0], np.array(list(my_features.values())))
            elif classifier[1] in ("SVM", "RF"):
                feat_array = [int(i) for i in my_features.values()]
                my_class = classifier[0].predict([feat_array])[0]
            else:
                my_class = classifier[0].classify(my_features)

            # if 1 then it is a period, otherwise it is an interval
            my_entity = chrono.ChronoDoseDurationEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                dose_type=getDoseDurationValue(val) if my_class == 1 else val,
                number=None,
                text=phrase_text)
            chrono_id = chrono_id + 1

            chrono_list.append(my_entity)

    return chrono_list, chrono_id
def _attachPrecedingNumber(phrase_text, idxstart, ref_Sspan, my_entity,
                           chrono_id, chrono_list):
    """Link the number directly before the interval word to ``my_entity``.

    Only the token immediately preceding position ``idxstart`` of
    ``phrase_text`` is inspected.  Splitting is done on single spaces, so a
    phrase like "42-year-old" or "six-months" keeps its hyphen in that token.
    If a number is found, a ``ChronoNumber`` is appended to ``chrono_list``
    and set on ``my_entity``.

    Returns the (possibly incremented) ``chrono_id``.
    """
    if idxstart <= 0:
        return chrono_id

    ## purposefully split on a single space
    before = phrase_text[0:idxstart]
    # remember whether a space separates the number from the interval word;
    # if not (e.g. "six-months") the last character is a joiner, not part
    # of the number
    has_space = before[-1] == ' '
    tokens = before.strip(' ').split(' ')

    ## the previous token, and the offset at which it starts
    prevtok = tokens[-1]
    if len(tokens) > 1:
        rest_of_phrase_length = len(' '.join(tokens[:-1])) + 1
    else:
        rest_of_phrase_length = 0

    m = re.search('([0-9]{1,2})', prevtok)
    if m is not None:
        num_val = m.group(0)
        my_number_entity = chrono.ChronoNumber(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan + rest_of_phrase_length + m.span(0)[0],
            end_span=ref_Sspan + rest_of_phrase_length + m.span(0)[1],
            value=num_val)
        chrono_id = chrono_id + 1

        # add the number entity to the list
        chrono_list.append(my_number_entity)
        my_entity.set_number(my_number_entity.get_id())
    # else search for a text number
    else:
        texNumVal = utils.getNumberFromText(prevtok)
        if texNumVal is not None:
            num_start = ref_Sspan + rest_of_phrase_length
            num_end = num_start + len(prevtok)
            if not has_space:
                # drop the trailing joiner character from the span
                num_end = num_end - 1
            # create the number entity
            my_number_entity = chrono.ChronoNumber(
                entityID=str(chrono_id) + "entity",
                start_span=num_start,
                end_span=num_end,
                value=texNumVal)
            chrono_id = chrono_id + 1
            # append to list
            chrono_list.append(my_number_entity)
            # link to interval entity
            my_entity.set_number(my_number_entity.get_id())

    return chrono_id


def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Build Period / Calendar-Interval entities for one temporal phrase.

    The phrase is first checked with ``hasPeriodInterval``; if that finds
    nothing, ``hasEmbeddedPeriodInterval`` is tried.  Every entity that gets
    built (the period/interval itself, an optional This/Next/Last operator and
    an optional Number) is appended to ``chrono_list``.

    NOTE(review): an earlier ``buildPeriodInterval`` is defined above in this
    file; if both live in the same module this later definition is the one
    that takes effect.

    Args:
        s: temporal phrase object; ``getText()`` and ``getSpan()`` are used.
        chrono_id: next free integer used to build entity IDs
            (``"<id>entity"``).
        chrono_list: list the new entities are appended to (mutated in place).
        ref_list: reference tokens; ``getText()`` and ``getSpan()`` are used.
        classifier: indexable pair where ``classifier[0]`` is the model and
            ``classifier[1]`` names its kind ("NN", "SVM", "RF"; any other
            value is treated as a model exposing ``classify``).
        feats: feature dictionary; a copy is handed to the feature extractor.

    Returns:
        Tuple ``(chrono_list, chrono_id)`` with the updated ID counter.
    """
    ref_Sspan, _ = s.getSpan()
    boo, val, idxstart, idxend, _ = hasPeriodInterval(s)
    phrase_text = s.getText()
    punct_to_space = str.maketrans(string.punctuation,
                                   ' ' * len(string.punctuation))

    # FIND terms that are always marked as calendar intervals!
    if boo and re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            phrase_text):
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", phrase_text):
            # "yesterday" additionally gets a Last operator that points back
            # at the interval created just above (hence chrono_id - 1).
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)

    # FIND terms that are always marked as periods!
    elif boo and val == "Unknown":
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=val,
            number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    elif boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # get index of overlapping reference token
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)

        # extract ML features
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # classify into period or interval
        if classifier[1] == "NN":
            my_class = ChronoKeras.keras_classify(
                classifier[0], np.array(list(my_features.values())))
        elif classifier[1] in ("SVM", "RF"):
            feat_array = [int(i) for i in my_features.values()]
            my_class = classifier[0].predict([feat_array])[0]
        else:
            my_class = classifier[0].classify(my_features)

        # if 1 then it is a period, otherwise it is an interval.  The two
        # cases differ only in the entity type and in the keyword / setter
        # used to link operators to it.
        is_period = my_class == 1
        if is_period:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
            link_kw = "period"
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            link_kw = "repeating_interval"
        chrono_id = chrono_id + 1

        # Check to see if the entity has a "this" in front of it.  Only look
        # at the previous token when there is one: with ref_idx <= 0 the
        # expression ref_list[ref_idx - 1] would wrap around to the end of
        # the token list.
        prior_tok = ""
        if ref_idx > 0:
            prior_tok = ref_list[ref_idx - 1].getText().lower()

        if prior_tok.translate(punct_to_space) == "this":
            # add a This entity and link it to the period/interval.
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, _ = ref_list[ref_idx - 1].getSpan()

            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            chrono_id = chrono_id + 1
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
        else:
            # check for a Next/Last word
            hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
            if hasMod:
                if mod_type == "Next":
                    chrono_list.append(
                        chrono.ChronoNextOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            **{link_kw: my_entity.get_id()}))
                    chrono_id = chrono_id + 1

                if mod_type == "Last":
                    chrono_list.append(
                        chrono.ChronoLastOperator(
                            entityID=str(chrono_id) + "entity",
                            start_span=ref_Sspan + mod_start,
                            end_span=ref_Sspan + mod_end,
                            semantics="Interval-Not-Included",
                            **{link_kw: my_entity.get_id()}))
                    chrono_id = chrono_id + 1

        # Check to see if it has a number associated with it.  We assume the
        # number comes directly before the interval string.  Taking the
        # previous reference token does not work because phrases like
        # "42-year-old" are a single reference token, so the helper works on
        # character offsets within the phrase instead.
        chrono_id = _attachPrecedingNumber(phrase_text, idxstart, ref_Sspan,
                                           my_entity, chrono_id, chrono_list)

        chrono_list.append(my_entity)

    else:
        boo2, val, idxstart, idxend, _ = hasEmbeddedPeriodInterval(s)
        if boo2:
            abs_Sspan = ref_Sspan + idxstart
            abs_Espan = ref_Sspan + idxend

            # get index of the first overlapping reference token (-1 if none)
            ref_idx = -1
            for i, ref_tok in enumerate(ref_list):
                if utils.overlap(ref_tok.getSpan(), (abs_Sspan, abs_Espan)):
                    ref_idx = i
                    break

            # extract ML features
            my_features = utils.extract_prediction_features(
                ref_list, ref_idx, feats.copy())

            # classify into period or interval
            if classifier[1] == "NN":
                my_class = ChronoKeras.keras_classify(
                    classifier[0], np.array(list(my_features.values())))
            elif classifier[1] in ("SVM", "RF"):
                feat_array = [int(i) for i in my_features.values()]
                my_class = classifier[0].predict([feat_array])[0]
            else:
                my_class = classifier[0].classify(my_features)

            # if 1 then it is a period, otherwise it is an interval
            if my_class == 1:
                my_entity = chrono.ChronoPeriodEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    period_type=getPeriodValue(val),
                    number=None)
            else:
                my_entity = chrono.ChronoCalendarIntervalEntity(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_Sspan,
                    end_span=abs_Espan,
                    calendar_type=val)
            chrono_id = chrono_id + 1

            # Extract the number that precedes the interval, if any
            chrono_id = _attachPrecedingNumber(phrase_text, idxstart,
                                               ref_Sspan, my_entity,
                                               chrono_id, chrono_list)

            chrono_list.append(my_entity)

    return chrono_list, chrono_id