Exemplo n.º 1
0
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier,
                        feats):
    """Create Period / Calendar-Interval entities for one temporal phrase.

    The phrase is inspected with ``hasPeriodInterval``; when that finds
    nothing, ``hasEmbeddedPeriodInterval`` is tried instead.  Depending on the
    match, the function appends to ``chrono_list`` (in place):

    * a calendar-interval entity for terms that are always intervals
      ("yesterday", "today", "daily", ...), plus a Last operator for
      "yesterday";
    * a period entity of type "Unknown" when the detected value is "Unknown";
    * otherwise a period or calendar-interval entity chosen by the classifier,
      optionally accompanied by a This/Next/Last operator and a Number entity.

    Every new entity gets the ID ``str(chrono_id) + "entity"`` and bumps the
    counter by one.

    Args:
        s: Temporal phrase object; must provide ``getSpan()`` returning
            ``(start, end)`` and ``getText()``.
        chrono_id: Integer counter used to build the next entity ID.
        chrono_list: List of entities built so far; mutated in place.
        ref_list: Reference tokens; each must provide ``getSpan()`` and
            ``getText()``.
        classifier: Indexable pair ``(model, model_type)``.  ``"NN"`` is
            classified through ``ChronoKeras.keras_classify``, ``"SVM"`` and
            ``"RF"`` through ``model.predict``, anything else through
            ``model.classify``.
        feats: Feature dictionary template; a copy is handed to
            ``utils.extract_prediction_features``.

    Returns:
        Tuple ``(chrono_list, chrono_id)`` with the updated list and counter.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    boo, val, idxstart, idxend, plural = hasPeriodInterval(s)

    if boo:
        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # Terms that are always marked as calendar intervals.
        if re.search(
                "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
                s.getText()):
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
            chrono_id = chrono_id + 1

            if re.search("yesterday|yesterdays", s.getText()):
                # The Last operator points at the interval created just above
                # (hence chrono_id - 1) and is appended before that interval.
                chrono_list.append(
                    chrono.ChronoLastOperator(
                        entityID=str(chrono_id) + "entity",
                        start_span=abs_Sspan,
                        end_span=abs_Espan,
                        repeating_interval=str(chrono_id - 1) + "entity"))
                chrono_id = chrono_id + 1

            chrono_list.append(my_entity)
            return chrono_list, chrono_id

        # Terms that are always marked as periods.
        if val == "Unknown":
            chrono_list.append(
                chrono.ChronoPeriodEntity(entityID=str(chrono_id) + "entity",
                                          start_span=abs_Sspan,
                                          end_span=abs_Espan,
                                          period_type=val,
                                          number=None))
            return chrono_list, chrono_id + 1

        # Everything else is decided by the classifier.
        ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # if 1 then it is a period, if 0 then it is an interval
        is_period = _classifyPeriodInterval(classifier, my_features) == 1
        if is_period:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val,
                number=None)
        chrono_id = chrono_id + 1

        chrono_id = _linkThisOrModifier(s, my_entity, is_period, ref_list,
                                        ref_idx, ref_Sspan, chrono_id,
                                        chrono_list)

        # Check for an associated number.  We assume the number comes before
        # the interval string.
        if idxstart > 0:
            substr = s.getText()[0:idxstart]
            m = re.search('([0-9]{1,2})', substr)
            if m is not None:
                chrono_id = _appendChronoNumber(my_entity, m.group(0),
                                                ref_Sspan + m.span(0)[0],
                                                ref_Sspan + m.span(0)[1],
                                                chrono_id, chrono_list)
            else:
                # No digits: fall back to a spelled-out number.
                texNumVal = utils.getNumberFromText(substr)
                if texNumVal is not None:
                    chrono_id = _appendChronoNumber(
                        my_entity, texNumVal, ref_Sspan,
                        ref_Sspan + (idxstart - 1), chrono_id, chrono_list)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # No stand-alone period/interval: look for one embedded in a larger token.
    boo2, val, idxstart, idxend, numstr = hasEmbeddedPeriodInterval(s)
    if not boo2:
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    # Index of the first reference token overlapping the match, -1 if none.
    ref_idx = next((i for i, ref in enumerate(ref_list)
                    if utils.overlap(ref.getSpan(), (abs_Sspan, abs_Espan))),
                   -1)

    my_features = utils.extract_prediction_features(ref_list, ref_idx,
                                                    feats.copy())

    # Same classifier dispatch as the stand-alone branch, so "SVM"/"RF"
    # models are queried through predict() here as well.
    if _classifyPeriodInterval(classifier, my_features) == 1:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val)
    chrono_id = chrono_id + 1

    # Extract the number from the part of the phrase preceding the match.
    substr = s.getText()[:idxstart]
    m = re.search('([0-9]{1,2})', substr)
    if m is None:
        # No digits: try the spelled-out number reported by the matcher.
        texNumVal = utils.getNumberFromText(numstr)
        if texNumVal is not None:
            # numstr is literal text, so escape it before using it as a regex.
            m = re.search(re.escape(numstr), substr)
            if m is not None:
                chrono_id = _appendChronoNumber(my_entity, texNumVal,
                                                ref_Sspan + m.span(0)[0],
                                                ref_Sspan + m.span(0)[1],
                                                chrono_id, chrono_list)
    else:
        chrono_id = _appendChronoNumber(my_entity, m.group(0),
                                        ref_Sspan + m.span(0)[0],
                                        ref_Sspan + m.span(0)[1], chrono_id,
                                        chrono_list)

    chrono_list.append(my_entity)
    return chrono_list, chrono_id


def _classifyPeriodInterval(classifier, my_features):
    """Return the class predicted by ``classifier`` for ``my_features``."""
    model, model_type = classifier[0], classifier[1]
    if model_type == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if model_type in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _appendChronoNumber(my_entity, value, start_span, end_span, chrono_id,
                        chrono_list):
    """Append a Number entity, link it to ``my_entity``, return the next id."""
    my_number_entity = chrono.ChronoNumber(entityID=str(chrono_id) + "entity",
                                           start_span=start_span,
                                           end_span=end_span,
                                           value=value)
    chrono_list.append(my_number_entity)
    my_entity.set_number(my_number_entity.get_id())
    return chrono_id + 1


def _linkThisOrModifier(s, my_entity, is_period, ref_list, ref_idx, ref_Sspan,
                        chrono_id, chrono_list):
    """Append a This, Next or Last operator for ``my_entity`` if one applies.

    A preceding "this" token wins; otherwise the phrase is checked with
    ``hasModifier``.  Returns the (possibly incremented) ``chrono_id``.
    """
    # Periods are linked through "period", intervals through
    # "repeating_interval".
    link_key = "period" if is_period else "repeating_interval"

    # Only look at the previous token when there is one; a ref_idx of 0 (or
    # -1) would otherwise wrap around to the end of ref_list.
    if ref_idx > 0:
        prior_ref = ref_list[ref_idx - 1]
        prior_tok = prior_ref.getText().lower()
        # Punctuation is mapped to spaces (not removed), so this only matches
        # a token that is exactly "this".
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, prior_end = prior_ref.getSpan()

            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
            return chrono_id + 1

    # No "this": check for a Next/Last modifier inside the phrase.
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod and mod_type in ("Next", "Last"):
        op_kwargs = {
            "entityID": str(chrono_id) + "entity",
            "start_span": ref_Sspan + mod_start,
            "end_span": ref_Sspan + mod_end,
            link_key: my_entity.get_id(),
        }
        if mod_type == "Next":
            chrono_list.append(chrono.ChronoNextOperator(**op_kwargs))
        else:
            chrono_list.append(
                chrono.ChronoLastOperator(semantics="Interval-Not-Included",
                                          **op_kwargs))
        chrono_id = chrono_id + 1

    return chrono_id
def createMLTrainingMatrix(infiles,
                           gold_folder,
                           ext="",
                           save=False,
                           output="aquaint_train",
                           window=3):
    """Build the period/interval training matrix from gold-standard files.

    For each input file:
        1) parse the text into reference tokens,
        2) mark the tokens that are numeric or temporal,
        3) read the gold-standard ``period-interval.gold.csv`` file,
        4) for every token overlapping a gold entry, build a feature
           dictionary and record its category (1 for 'Period', 0 otherwise).

    The observations are then expanded to the full feature set and written to
    ``<output>_data.csv``; the categories go to ``<output>_class.csv``, one
    per line.

    Args:
        infiles: Sequence of input file paths without extension; the document
            time is read from ``<path>.dct`` and the text from
            ``<path><ext>``.
        gold_folder: Folder holding one sub-folder per input file (named after
            the last path component of the input) containing
            ``period-interval.gold.csv``.
        ext: Extension appended to each input path to locate the text file.
        save: When true, a trace of the gold parsing is written to
            ``./gold-standard-parsing.txt``.
        output: Path prefix of the two CSV files that are written.
        window: Window size passed to ``extract_bow_features``.

    Returns:
        Tuple ``(full_obs_list, category)``: the list of complete feature
        dictionaries and the parallel list of 0/1 categories.  Both are empty
        when no observation was collected; the data file then contains only
        the header row.
    """
    obs_list = []  # feature dictionary of each observation
    # Category of each observation: 1 for period, 0 otherwise.  Unknowns are
    # grouped with the calendar-interval category.
    category = []

    # Full list of features; its keys define the columns of the matrix.
    features = {
        'feat_numeric': 0,
        'feat_temp_context': 0,
        'feat_temp_self': 0
    }

    outfile = open("./gold-standard-parsing.txt", 'w') if save else None
    try:
        for infile in infiles:
            print("ML Parsing " + infile + " ...")

            ## parse out the doctime
            doctime = utils.getDocTime(infile + ".dct")
            if debug:
                print(doctime)

            ## parse out reference tokens
            text, tokens, spans, tags = utils.getWhitespaceTokens(infile +
                                                                  ext)
            my_refToks = referenceToken.convertToRefTokens(tok_list=tokens,
                                                           span=spans,
                                                           pos=tags)

            ## mark all ref tokens if they are numeric or temporal
            chroList = utils.markTemporal(my_refToks)

            ## import gold standard data
            gold_file = os.path.join(gold_folder,
                                     os.path.split(infile)[1],
                                     "period-interval.gold.csv")
            if not os.path.exists(gold_file):
                print(gold_file + " DOES NOT EXISTS")
                # NOTE(review): this stops processing ALL remaining files, not
                # just this one -- confirm that `continue` is not intended.
                break

            if outfile is not None:
                outfile.write("\n$$$$$$$$$$$\nProcessing: " + gold_file)

            gold_list = []
            # newline='' is what the csv module expects for its file objects.
            with open(gold_file, newline='') as gold_handle:
                for row in csv.DictReader(gold_handle):
                    gold_list.append({
                        'type': row['type'],
                        'start': row['start'],
                        'end': row['end'],
                        'value': row['value']
                    })
                    if outfile is not None:
                        outfile.write("\n" + str(row))

            # Convert the gold spans once instead of once per token.
            gold_spans = [[int(g['start']), int(g['end'])] for g in gold_list]
            last_idx = len(chroList) - 1

            ## find the gold entries overlapping each reference token
            for r, reftok in enumerate(chroList):
                ref_s, ref_e = reftok.getSpan()
                for g, gold_span in zip(gold_list, gold_spans):
                    if not utils.overlap([ref_s, ref_e], gold_span):
                        continue

                    if outfile is not None:
                        outfile.write("\nPrevious Token: " +
                                      str(chroList[max(r - 1, 0)]))
                        outfile.write("\nTarget Token: " + str(reftok))
                        outfile.write("\nNext Token: " +
                                      str(chroList[min(r + 1, last_idx)]) +
                                      "\n")

                    ### Identify temporal features
                    # NOTE(review): the window here is fixed at 3 and ignores
                    # the `window` argument -- confirm this is intentional.
                    this_obs = extract_temp_features(chroList, r, 3, {})

                    ### Extract all words within an N-word window
                    this_obs, _ = extract_bow_features(chroList, r, window,
                                                       features, this_obs)

                    ### Is there a numeric before or after the target word?
                    this_obs = extract_numeric_feature(chroList, r, this_obs)

                    ### Stem and extract the actual word
                    this_obs, _ = extract_stem_feature(chroList[r], features,
                                                       this_obs)

                    ### Get the correct type
                    category.append(1 if g['type'] == 'Period' else 0)
                    obs_list.append(this_obs)
    finally:
        # Close the trace file even if parsing fails part-way through.
        if outfile is not None:
            outfile.close()

    ## All features are collected; put them together in a matrix.
    print("features length: " + str(len(features.keys())))
    print("obs_list length: " + str(len(obs_list)))
    print("category length: " + str(len(category)))

    ## Expand every observation to a dictionary holding all feature elements.
    full_obs_list = []
    for obs in obs_list:
        feats = deepcopy(features)
        feats.update(obs)
        full_obs_list.append(feats)

    ## Write the matrix and the categories to file, then return them.
    # With no observations there is no first row to take the header from, so
    # fall back to the feature template.
    if full_obs_list:
        keys = list(full_obs_list[0].keys())
    else:
        keys = list(features.keys())
    with open(output + '_data.csv', 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(full_obs_list)

    with open(output + '_class.csv', 'w') as output_file:
        for c in category:
            output_file.write("%s\n" % c)

    return (full_obs_list, category)
Exemplo n.º 3
0
def buildDoseDuration(s, chrono_id, chrono_list, ref_list, classifier, feats):
    """Create a Dose-Duration entity for a phrase, if it qualifies as one.

    The phrase is first run through a series of rejection checks; when any of
    them fires, ``chrono_list`` and ``chrono_id`` are returned unchanged.  A
    phrase that survives is matched with ``hasDoseDuration`` (falling back to
    ``hasEmbeddedPeriodInterval``) and a ``ChronoDoseDurationEntity`` spanning
    the whole phrase is appended to ``chrono_list``.  In the
    ``hasDoseDuration`` case a This/Next/Last operator may be appended too.

    Args:
        s: Temporal phrase object; must provide ``getSpan()`` returning
            ``(start, end)`` and ``getText()``.
        chrono_id: Integer counter used to build the next entity ID
            (``str(chrono_id) + "entity"``).
        chrono_list: List of entities built so far; mutated in place.
        ref_list: Reference tokens; each must provide ``getSpan()``,
            ``getText()`` and ``isNumeric()``.
        classifier: Indexable pair ``(model, model_type)``.  ``"NN"`` is
            classified through ``ChronoKeras.keras_classify``, ``"SVM"`` and
            ``"RF"`` through ``model.predict``, anything else through
            ``model.classify``.
        feats: Feature dictionary template; a copy is handed to
            ``utils.extract_prediction_features``.

    Returns:
        Tuple ``(chrono_list, chrono_id)`` with the updated list and counter.
    """
    ref_Sspan, ref_Espan = s.getSpan()
    phrase = s.getText()
    parts = phrase.split()

    # An empty or whitespace-only phrase has no first word to inspect and
    # cannot be a dose duration.
    if not parts:
        return chrono_list, chrono_id

    # Various checks to ensure that this phrase is actually a dose duration.
    lowered = phrase.lower()
    if isDoseDuration(parts[0]):
        return chrono_list, chrono_id
    if any(word in lowered for word in ("every", "time", "per", "once",
                                        "twice", "past", "ago")):
        return chrono_list, chrono_id
    if re.match(r"^q\d|^Q\d", phrase):
        return chrono_list, chrono_id
    if any(marker in phrase
           for marker in ("/", "[**", "**]", "ly", "(", ")", "RANDOM")):
        return chrono_list, chrono_id

    # Every word must be a cardinal number or a dose-duration term, and at
    # least one number must be present.
    punct_re = re.compile('[' + string.punctuation + ']')
    containsnum = False
    for part in parts:
        bare = punct_re.sub('', part).strip().lower()
        for ref in ref_list:
            ref_text = ref.getText()
            if ref_text.lower() != bare:
                continue
            if ref.isNumeric():
                containsnum = True
                if utils.isOrdinal(ref_text):
                    return chrono_list, chrono_id
                break
            if not tt.hasDoseDuration(ref_text.lower()):
                return chrono_list, chrono_id
    if not containsnum:
        return chrono_list, chrono_id

    boo, val, _, _, _ = hasDoseDuration(s)
    if boo:
        ref_idx = utils.getRefIdx(ref_list, ref_Sspan, ref_Espan)
        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # Class 1 is treated as a period, anything else as an interval; both
        # yield a dose-duration entity but differ in the dose type and in how
        # operators are linked to it.
        is_period = _classifyDoseDuration(classifier, my_features) == 1
        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan,
            end_span=ref_Espan,
            dose_type=getDoseDurationValue(val) if is_period else val,
            number=None,
            text=phrase)
        chrono_id = chrono_id + 1

        chrono_id = _linkDoseThisOrModifier(s, my_entity, is_period, ref_list,
                                            ref_idx, ref_Sspan, chrono_id,
                                            chrono_list)
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    boo2, val, _, _, _ = hasEmbeddedPeriodInterval(s)
    if boo2:
        # Index of the first reference token overlapping the phrase, else -1.
        ref_idx = next((i for i, ref in enumerate(ref_list)
                        if utils.overlap(ref.getSpan(),
                                         (ref_Sspan, ref_Espan))), -1)

        my_features = utils.extract_prediction_features(
            ref_list, ref_idx, feats.copy())

        # Same classifier dispatch as above, so "SVM"/"RF" models are queried
        # through predict() here as well.
        is_period = _classifyDoseDuration(classifier, my_features) == 1
        my_entity = chrono.ChronoDoseDurationEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_Sspan,
            end_span=ref_Espan,
            dose_type=getDoseDurationValue(val) if is_period else val,
            number=None,
            text=phrase)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)

    return chrono_list, chrono_id


def _classifyDoseDuration(classifier, my_features):
    """Return the class predicted by ``classifier`` for ``my_features``."""
    model, model_type = classifier[0], classifier[1]
    if model_type == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if model_type in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _linkDoseThisOrModifier(s, my_entity, is_period, ref_list, ref_idx,
                            ref_Sspan, chrono_id, chrono_list):
    """Append a This, Next or Last operator for ``my_entity`` if one applies.

    A preceding "this" token wins; otherwise the phrase is checked with
    ``hasModifier``.  Returns the (possibly incremented) ``chrono_id``.
    """
    # Periods are linked through "period", intervals through
    # "repeating_interval".
    link_key = "period" if is_period else "repeating_interval"

    # Only look at the previous token when there is one; a ref_idx of 0 (or
    # -1) would otherwise wrap around to the end of ref_list.
    if ref_idx > 0:
        prior_ref = ref_list[ref_idx - 1]
        prior_tok = prior_ref.getText().lower()
        # Punctuation is mapped to spaces (not removed), so this only matches
        # a token that is exactly "this".
        if prior_tok.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation))) == "this":
            start_span, end_span = re.search("this", prior_tok).span(0)
            prior_start, prior_end = prior_ref.getSpan()

            chrono_this_entity = chrono.ChronoThisOperator(
                entityID=str(chrono_id) + "entity",
                start_span=prior_start + start_span,
                end_span=prior_start + end_span)
            if is_period:
                chrono_this_entity.set_period(my_entity.get_id())
            else:
                chrono_this_entity.set_repeating_interval(my_entity.get_id())
            chrono_list.append(chrono_this_entity)
            return chrono_id + 1

    # No "this": check for a Next/Last modifier inside the phrase.
    hasMod, mod_type, mod_start, mod_end = hasModifier(s)
    if hasMod and mod_type in ("Next", "Last"):
        op_kwargs = {
            "entityID": str(chrono_id) + "entity",
            "start_span": ref_Sspan + mod_start,
            "end_span": ref_Sspan + mod_end,
            link_key: my_entity.get_id(),
        }
        if mod_type == "Next":
            chrono_list.append(chrono.ChronoNextOperator(**op_kwargs))
        else:
            chrono_list.append(
                chrono.ChronoLastOperator(semantics="Interval-Not-Included",
                                          **op_kwargs))
        chrono_id = chrono_id + 1

    return chrono_id
Exemplo n.º 4
0
def buildPeriodInterval(s, chrono_id, chrono_list, ref_list, classifier,
                        feats):
    """Create Period / Calendar-Interval entities for one temporal phrase.

    The phrase is inspected with ``hasPeriodInterval``; when that finds
    nothing, ``hasEmbeddedPeriodInterval`` is tried as a fallback. Depending
    on the match, one of four things happens:

    1. The phrase text contains one of the fixed keywords (yesterday,
       tomorrow, today, daily, /min, /week): a calendar interval is created,
       plus a Last operator when the text contains "yesterday".
    2. The matched value is ``"Unknown"``: a period is created.
    3. Any other match: the classifier decides between period (class 1) and
       calendar interval (anything else); a This/Next/Last operator and a
       preceding number are linked when present.
    4. Embedded match: the classifier decides as in (3) and a preceding
       number is linked when present; no This/Next/Last operator is built.

    Args:
        s: Phrase object exposing ``getText()`` and ``getSpan()``
            (presumably a TimePhrase -- confirm against the caller).
        chrono_id: Counter used to build entity IDs as
            ``str(chrono_id) + "entity"``; advanced once per entity created.
        chrono_list: List that receives every entity created here. It is
            mutated in place.
        ref_list: Sequence of reference tokens exposing ``getText()`` and
            ``getSpan()``.
        classifier: Indexable pair; ``classifier[0]`` is the model and
            ``classifier[1]`` the model type (``"NN"``, ``"SVM"``, ``"RF"``;
            any other value uses ``classifier[0].classify``).
        feats: Feature template; a copy is handed to
            ``utils.extract_prediction_features`` so the caller's object is
            not passed on directly.

    Returns:
        The tuple ``(chrono_list, chrono_id)`` with the updated counter.
    """
    ref_Sspan, _ = s.getSpan()
    boo, val, idxstart, idxend, _ = hasPeriodInterval(s)

    if not boo:
        # Fallback: the period/interval term is embedded in a larger token.
        boo2, val, idxstart, idxend, _ = hasEmbeddedPeriodInterval(s)
        if not boo2:
            return chrono_list, chrono_id

        abs_Sspan = ref_Sspan + idxstart
        abs_Espan = ref_Sspan + idxend

        # Index of the first reference token overlapping the match, or -1.
        ref_idx = next(
            (i for i, tok in enumerate(ref_list)
             if utils.overlap(tok.getSpan(), (abs_Sspan, abs_Espan))), -1)

        my_class = _classify_period_or_interval(ref_list, ref_idx,
                                                classifier, feats)

        # if 1 then it is a period, otherwise it is an interval
        if my_class == 1:
            my_entity = chrono.ChronoPeriodEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                period_type=getPeriodValue(val),
                number=None)
        else:
            my_entity = chrono.ChronoCalendarIntervalEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                calendar_type=val)
        chrono_id = chrono_id + 1

        chrono_id = _attach_preceding_number(s, ref_Sspan, idxstart,
                                             my_entity, chrono_id,
                                             chrono_list)
        # The main entity goes in after the number entity linked to it.
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    abs_Sspan = ref_Sspan + idxstart
    abs_Espan = ref_Sspan + idxend

    # FIND terms that are always marked as calendar intervals!
    if re.search(
            "yesterday|yesterdays|tomorrow|tomorrows|today|todays|daily|/min|/week",
            s.getText()):
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        chrono_id = chrono_id + 1

        if re.search("yesterday|yesterdays", s.getText()):
            # The Last operator refers back to the interval just created,
            # whose ID was built from the previous counter value.
            my_last_entity = chrono.ChronoLastOperator(
                entityID=str(chrono_id) + "entity",
                start_span=abs_Sspan,
                end_span=abs_Espan,
                repeating_interval=str(chrono_id - 1) + "entity")
            chrono_id = chrono_id + 1
            chrono_list.append(my_last_entity)

        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # FIND terms that are always marked as periods!
    if val == "Unknown":
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=val,
            number=None)
        chrono_id = chrono_id + 1
        chrono_list.append(my_entity)
        return chrono_list, chrono_id

    # Everything else is ambiguous: let the classifier decide.
    ref_idx = utils.getRefIdx(ref_list, abs_Sspan, abs_Espan)
    my_class = _classify_period_or_interval(ref_list, ref_idx, classifier,
                                            feats)

    # if 1 then it is a period, otherwise it is an interval
    if my_class == 1:
        my_entity = chrono.ChronoPeriodEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            period_type=getPeriodValue(val),
            number=None)
        link_name = "period"
    else:
        my_entity = chrono.ChronoCalendarIntervalEntity(
            entityID=str(chrono_id) + "entity",
            start_span=abs_Sspan,
            end_span=abs_Espan,
            calendar_type=val,
            number=None)
        link_name = "repeating_interval"
    chrono_id = chrono_id + 1

    chrono_id = _link_this_next_last(s, ref_list, ref_idx, ref_Sspan,
                                     my_entity, link_name, chrono_id,
                                     chrono_list)
    chrono_id = _attach_preceding_number(s, ref_Sspan, idxstart, my_entity,
                                         chrono_id, chrono_list)
    # The main entity goes in after any operator / number linked to it.
    chrono_list.append(my_entity)

    return chrono_list, chrono_id


def _classify_period_or_interval(ref_list, ref_idx, classifier, feats):
    """Return the classifier's label for the reference token at ``ref_idx``.

    Features are extracted into a fresh copy of ``feats`` and dispatched on
    ``classifier[1]``: ``"NN"`` goes through ``ChronoKeras.keras_classify``,
    ``"SVM"`` / ``"RF"`` through ``predict`` on an int-cast feature vector,
    and anything else through ``classify``.
    """
    my_features = utils.extract_prediction_features(ref_list, ref_idx,
                                                    feats.copy())
    model = classifier[0]
    model_type = classifier[1]

    if model_type == "NN":
        return ChronoKeras.keras_classify(
            model, np.array(list(my_features.values())))
    if model_type in ("SVM", "RF"):
        feat_array = [int(i) for i in my_features.values()]
        return model.predict([feat_array])[0]
    return model.classify(my_features)


def _link_this_next_last(s, ref_list, ref_idx, ref_Sspan, my_entity,
                         link_name, chrono_id, chrono_list):
    """Append a This/Next/Last operator pointing at ``my_entity``, if any.

    ``link_name`` is ``"period"`` or ``"repeating_interval"``; it selects
    both the ``set_<link_name>`` setter used on the This operator and the
    keyword argument used for the Next/Last operators. Returns the updated
    ``chrono_id``.
    """
    # Only look at the preceding token when one exists. An index of 0 (or a
    # negative "not found" value) would otherwise wrap around to the end of
    # ref_list and inspect an unrelated token.
    prior = ref_list[ref_idx - 1] if ref_idx > 0 else None
    prior_tok = prior.getText().lower() if prior is not None else ""

    ### Check to see if this entity has a "this" in front of it
    if prior_tok.translate(
            str.maketrans(string.punctuation,
                          ' ' * len(string.punctuation))) == "this":
        # add a This entity and link it to the period / interval.
        # Pattern first, subject second; the token is never used as a regex.
        start_span, end_span = re.search("this", prior_tok).span(0)
        prior_start, _ = prior.getSpan()

        chrono_this_entity = chrono.ChronoThisOperator(
            entityID=str(chrono_id) + "entity",
            start_span=prior_start + start_span,
            end_span=prior_start + end_span)
        chrono_id = chrono_id + 1
        getattr(chrono_this_entity,
                "set_" + link_name)(my_entity.get_id())
        chrono_list.append(chrono_this_entity)
        return chrono_id

    # check for a Next / Last word inside the phrase itself
    hasMod, mod_type, mod_start, mod_end = hasNextLastThis(s)
    if hasMod:
        link = {link_name: my_entity.get_id()}
        if mod_type == "Next":
            chrono_list.append(
                chrono.ChronoNextOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    **link))
            chrono_id = chrono_id + 1
        elif mod_type == "Last":
            chrono_list.append(
                chrono.ChronoLastOperator(
                    entityID=str(chrono_id) + "entity",
                    start_span=ref_Sspan + mod_start,
                    end_span=ref_Sspan + mod_end,
                    semantics="Interval-Not-Included",
                    **link))
            chrono_id = chrono_id + 1

    return chrono_id


def _attach_preceding_number(s, ref_Sspan, idxstart, my_entity, chrono_id,
                             chrono_list):
    """Link the number directly preceding the matched term to ``my_entity``.

    The number is assumed to come before the period/interval string. The
    previous reference token cannot be used because phrases such as
    "42-year-old" are a single reference token, so the span is computed with
    index arithmetic on the phrase text. Returns the updated ``chrono_id``.
    """
    if idxstart <= 0:
        # Nothing precedes the matched term.
        return chrono_id

    ## purposefully split on a single space
    substr = s.getText()[0:idxstart]
    # Phrases like "six-months" have no space before the term, which changes
    # where the number's span ends.
    has_space = substr.endswith(' ')
    # NOTE(review): strip(' ') also drops leading spaces, which would shift
    # the offsets below if the phrase text started with a space.
    tokens = substr.strip(' ').split(' ')

    ## get the previous token
    prevtok = tokens[-1]

    ## length of everything before prevtok, including the joining space
    if len(tokens) > 1:
        rest_of_phrase_length = len(' '.join(tokens[:-1])) + 1
    else:
        rest_of_phrase_length = 0

    # Only the first one or two digits are captured.
    m = re.search('([0-9]{1,2})', prevtok)
    if m is not None:
        num_val = m.group(0)
        num_start = ref_Sspan + rest_of_phrase_length + m.span(0)[0]
        num_end = ref_Sspan + rest_of_phrase_length + m.span(0)[1]
    else:
        # else search for a text number
        num_val = utils.getNumberFromText(prevtok)
        if num_val is None:
            return chrono_id
        num_start = ref_Sspan + rest_of_phrase_length
        num_end = ref_Sspan + rest_of_phrase_length + len(prevtok)
        if not has_space:
            # Drop the trailing joiner character (e.g. the "-" in "six-").
            num_end = num_end - 1

    my_number_entity = chrono.ChronoNumber(
        entityID=str(chrono_id) + "entity",
        start_span=num_start,
        end_span=num_end,
        value=num_val)
    chrono_id = chrono_id + 1

    # add the number entity to the list and link it to the main entity
    chrono_list.append(my_number_entity)
    my_entity.set_number(my_number_entity.get_id())
    return chrono_id