# Example #1
0
def buildMonthOfYear(s, chrono_id, chrono_list, flags):
    """Append a month-of-year entity when ``s`` contains a numeric month.

    Args:
        s: Element exposing ``getSpan()``; it is also handed to
            ``hasMonthOfYear`` for detection.
        chrono_id: Counter used to build the entity ID (``"<id>entity"``).
        chrono_list: List that receives the new entity (mutated in place).
        flags: Dict of "already handled" markers; ``flags["month"]`` is set
            to ``True`` as soon as a month is detected, even if it is then
            rejected by the range check.

    Returns:
        The tuple ``(chrono_list, chrono_id, flags)``; ``chrono_id`` is
        advanced by one only when an entity was appended.
    """
    found, month_text, rel_start, rel_end = hasMonthOfYear(s)
    if not found or flags["month"]:
        return chrono_list, chrono_id, flags

    flags["month"] = True

    # Detection offsets are relative to the element; shift them by the
    # element's own start to get absolute positions.
    span_origin, _ = s.getSpan()
    first_char = span_origin + rel_start
    last_char = first_char + abs(rel_end - rel_start)

    if int(month_text) > 12:
        return chrono_list, chrono_id, flags

    # NOTE(review): presumably utils.getMonthNumber maps the matched text to
    # a 1-12 index into calendar.month_name -- confirm against utils.
    month_label = calendar.month_name[utils.getMonthNumber(month_text)]
    chrono_list.append(
        chrono.chronoMonthOfYearEntity(
            entityID=str(chrono_id) + "entity",
            start_span=first_char,
            end_span=last_char,
            month_type=month_label))

    return chrono_list, chrono_id + 1, flags
# Example #2
0
def _isPlausibleMonthDay(month, day):
    """Return True when ``month`` is 1-12 and ``day`` is 1-31 (neither None)."""
    return (month is not None and day is not None
            and 1 <= month <= 12 and 1 <= day <= 31)


def _appendCondensedDate(chrono_list, chrono_id, ref_start, year_cls, year,
                         month, day, year_at, month_at, day_at):
    """Append linked year -> month -> day entities and return the next free id.

    ``year_at``, ``month_at`` and ``day_at`` are ``(start, end)`` character
    offsets that are added to ``ref_start``. ``year_cls`` is the chrono class
    used for the year (four-digit entity or two-digit operator).
    """
    year_entity = year_cls(
        entityID=str(chrono_id) + "entity",
        start_span=ref_start + year_at[0],
        end_span=ref_start + year_at[1],
        value=year)
    chrono_id += 1

    month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_start + month_at[0],
        end_span=ref_start + month_at[1],
        month_type=calendar.month_name[month])
    chrono_id += 1
    year_entity.set_sub_interval(month_entity.get_id())

    day_entity = chrono.ChronoDayOfMonthEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_start + day_at[0],
        end_span=ref_start + day_at[1],
        value=day)
    chrono_id += 1
    month_entity.set_sub_interval(day_entity.get_id())

    chrono_list.extend([year_entity, month_entity, day_entity])
    return chrono_id


def buildNumericDate(s, chrono_id, chrono_list, flags):
    """Find lone four-digit years and condensed numeric dates in ``s``.

    The element text is lower-cased, stripped of leading/trailing ``.`` and
    ``,`` and split on single spaces. Each token is then examined by length:

    * 4 characters: a number between 1500 and 2050 becomes a year entity,
      unless ``flags["fourdigityear"]`` or ``flags["loneDigitYear"]`` is
      already set. 24-hour times in that range are therefore read as years.
    * 8 characters: tried as ``yyyymmdd`` first, then as ``mmddyyyy``.
    * 6 characters: tried as ``mmddyy`` first, then as ``yymmdd``. Dates
      such as 120106 vs 061201 cannot be told apart.

    Args:
        s: Element exposing ``getText()`` and ``getSpan()``.
        chrono_id: Counter used to build entity IDs (``"<id>entity"``).
        chrono_list: List that receives new entities (mutated in place).
        flags: Dict of "already handled" markers; ``"loneDigitYear"`` is set
            when a lone year is emitted.

    Returns:
        The tuple ``(chrono_list, chrono_id, flags)`` with ``chrono_id``
        advanced past every entity that was created.
    """
    # NOTE(review): spans for the 6- and 8-character formats are offsets from
    # the start of the element's span, i.e. they assume the token begins the
    # element's text -- confirm this holds for multi-token elements.
    tokens = s.getText().lower().strip(".,").split(" ")

    for text in tokens:
        if len(text) == 4:
            num = utils.getNumberFromText(text)
            if (num is not None and 1500 <= num <= 2050
                    and not flags["fourdigityear"]
                    and not flags["loneDigitYear"]):
                # The token was lower-cased above, so search the original
                # text case-insensitively and treat the token literally.
                match = re.search(re.escape(text), s.getText(),
                                  re.IGNORECASE)
                if match is None:
                    continue
                flags["loneDigitYear"] = True
                ref_start, _ = s.getSpan()
                start_idx, end_idx = match.span(0)
                chrono_list.append(
                    chrono.ChronoYearEntity(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_start + start_idx,
                        end_span=ref_start + end_idx,
                        value=num))
                chrono_id += 1

        # Condensed date formats like 19980303 or 03031998.
        elif len(text) == 8 and utils.getNumberFromText(text) is not None:
            y = utils.getNumberFromText(text[0:4])
            m = utils.getNumberFromText(text[4:6])
            d = utils.getNumberFromText(text[6:8])
            if y is None:
                continue
            if 1500 <= y <= 2050 and _isPlausibleMonthDay(m, d):
                # yyyymmdd
                ref_start, _ = s.getSpan()
                chrono_id = _appendCondensedDate(
                    chrono_list, chrono_id, ref_start,
                    chrono.ChronoYearEntity, y, m, d,
                    (0, 4), (4, 6), (6, 8))
            else:
                # mmddyyyy
                y2 = utils.getNumberFromText(text[4:8])
                m2 = utils.getNumberFromText(text[0:2])
                d2 = utils.getNumberFromText(text[2:4])
                if (y2 is not None and 1500 <= y2 <= 2050
                        and _isPlausibleMonthDay(m2, d2)):
                    ref_start, _ = s.getSpan()
                    # Use the mmddyyyy readings (y2/d2), not the rejected
                    # yyyymmdd ones.
                    chrono_id = _appendCondensedDate(
                        chrono_list, chrono_id, ref_start,
                        chrono.ChronoYearEntity, y2, m2, d2,
                        (4, 8), (0, 2), (2, 4))

        # Condensed date formats like 030399 or 990303.
        elif len(text) == 6 and utils.getNumberFromText(text) is not None:
            y = utils.getNumberFromText(text[4:6])
            m = utils.getNumberFromText(text[0:2])
            d = utils.getNumberFromText(text[2:4])
            if y is None or m is None or d is None:
                continue
            if _isPlausibleMonthDay(m, d):
                # mmddyy
                ref_start, _ = s.getSpan()
                chrono_id = _appendCondensedDate(
                    chrono_list, chrono_id, ref_start,
                    chrono.ChronoTwoDigitYearOperator, y, m, d,
                    (4, 6), (0, 2), (2, 4))
            else:
                # yymmdd
                y2 = utils.getNumberFromText(text[0:2])
                m2 = utils.getNumberFromText(text[2:4])
                d2 = utils.getNumberFromText(text[4:6])
                if y2 is not None and _isPlausibleMonthDay(m2, d2):
                    ref_start, _ = s.getSpan()
                    chrono_id = _appendCondensedDate(
                        chrono_list, chrono_id, ref_start,
                        chrono.ChronoTwoDigitYearOperator, y2, m2, d2,
                        (0, 2), (2, 4), (4, 6))

    return chrono_list, chrono_id, flags


####
# END_MODULE
####
# Example #3
0
def build2DigitYear(s, chrono_id, chrono_list, flags):
    """Build a two-digit-year operator plus any finer entities found in ``s``.

    When ``has2DigitYear`` reports a match and ``flags["fourdigityear"]`` is
    not set, a ``ChronoTwoDigitYearOperator`` is created. The same element is
    then probed for a month and, nested beneath it, day, hour, minute and
    second. Each level is only probed when the level above was detected and
    its flag was not already set. Every entity that passes its range check is
    registered as the sub-interval of the entity one level coarser.

    A component that is detected but fails its range check is skipped: its
    flag is still set, no entity is created for it, and the next finer entity
    (if any) is appended without a parent link.

    Args:
        s: Element exposing ``getSpan()``; also passed to the ``has*``
            detectors.
        chrono_id: Counter used to build entity IDs (``"<id>entity"``).
        chrono_list: List that receives new entities (mutated in place).
            Entities are appended finest-first, ending with the year.
        flags: Dict of "already handled" markers; ``"month"``, ``"day"``,
            ``"hour"``, ``"minute"`` and ``"second"`` are set as each
            component is detected.

    Returns:
        The tuple ``(chrono_list, chrono_id, flags)`` with ``chrono_id``
        advanced past every entity that was created.
    """
    b, text, startSpan, endSpan = has2DigitYear(s)
    if not b or flags["fourdigityear"]:
        return chrono_list, chrono_id, flags

    # In most cases the year will be at the end of the span.
    ref_StartSpan, _ = s.getSpan()
    abs_StartSpan = ref_StartSpan + startSpan
    year_entity = chrono.ChronoTwoDigitYearOperator(
        entityID=str(chrono_id) + "entity",
        start_span=abs_StartSpan,
        end_span=abs_StartSpan + abs(endSpan - startSpan),
        value=text)
    chrono_id += 1

    # Entities in creation order (coarsest first); appended in reverse below.
    created = [year_entity]

    # Check for a month in the same element.
    bMonth, textMonth, startSpanMonth, endSpanMonth = hasMonthOfYear(s)
    if bMonth and not flags["month"]:
        flags["month"] = True
        abs_StartSpanMonth = ref_StartSpan + startSpanMonth
        abs_EndSpanMonth = abs_StartSpanMonth + abs(endSpanMonth -
                                                    startSpanMonth)
        m = utils.getMonthNumber(textMonth)

        month_entity = None
        # calendar.month_name[0] is the empty string, so 0 is rejected too.
        if m is not None and 1 <= m <= 12:
            month_entity = chrono.chronoMonthOfYearEntity(
                entityID=str(chrono_id) + "entity",
                start_span=abs_StartSpanMonth,
                end_span=abs_EndSpanMonth,
                month_type=calendar.month_name[m])
            chrono_id += 1
            year_entity.set_sub_interval(month_entity.get_id())
            created.append(month_entity)

        # Day, hour, minute and second all follow the same pattern:
        # (detector, flag key, inclusive upper bound, entity class).
        levels = (
            (hasDayOfMonth, "day", 31, chrono.ChronoDayOfMonthEntity),
            (hasHourOfDay, "hour", 24, chrono.ChronoHourOfDayEntity),
            (hasMinuteOfHour, "minute", 60,
             chrono.ChronoMinuteOfHourEntity),
            (hasSecondOfMinute, "second", 60,
             chrono.ChronoSecondOfMinuteEntity),
        )
        parent = month_entity
        for detect, flag, upper, entity_cls in levels:
            found, raw, rel_start, rel_end = detect(s)
            if not found or flags[flag]:
                # Finer levels are only probed beneath a detected level.
                break
            flags[flag] = True
            abs_start = ref_StartSpan + rel_start
            abs_end = abs_start + abs(rel_end - rel_start)

            entity = None
            if int(raw) <= upper:
                entity = entity_cls(
                    entityID=str(chrono_id) + "entity",
                    start_span=abs_start,
                    end_span=abs_end,
                    value=int(raw))
                chrono_id += 1
                # Link only when the coarser entity was actually created.
                if parent is not None:
                    parent.set_sub_interval(entity.get_id())
                created.append(entity)
            parent = entity

    chrono_list.extend(reversed(created))
    return chrono_list, chrono_id, flags
# Example #4
0
def _appendTextMonthDay(chrono_list, chrono_id, ref_start, full_text, needle,
                        value):
    """Append a day-of-month entity located via ``needle``; return the next id."""
    day_startidx, day_endidx = calculateSpan(full_text, needle)
    chrono_list.append(
        chrono.ChronoDayOfMonthEntity(
            entityID=str(chrono_id) + "entity",
            start_span=ref_start + day_startidx,
            end_span=ref_start + day_endidx,
            value=value))
    return chrono_id + 1


def _appendTextMonthYear(chrono_list, chrono_id, ref_start, full_text, needle,
                         value, month_entity):
    """Append a year entity whose sub-interval is ``month_entity``; return the next id."""
    year_startidx, year_endidx = calculateSpan(full_text, needle)
    year_entity = chrono.ChronoYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_start + year_startidx,
        end_span=ref_start + year_endidx,
        value=value)
    chrono_list.append(year_entity)
    year_entity.set_sub_interval(month_entity.get_id())
    return chrono_id + 1


def buildTextMonthAndDay(s,
                         chrono_id,
                         chrono_list,
                         flags,
                         dct=None,
                         ref_list=None):
    """Build a month entity from a textual month plus nearby day/year/modifier.

    When ``hasTextMonth`` reports a match and ``flags["month"]`` is not set,
    a month-of-year entity is created. Then:

    * Text after the month is parsed. If the whole remainder is a number,
      values up to 31 become a day (guarded by ``flags["day"]``) and values
      from 1500 to 2050 become a year (guarded by ``flags["fourdigityear"]``
      and ``flags["loneDigitYear"]``). Otherwise punctuation is replaced by
      spaces and each whitespace-separated token is classified the same way.
    * If the month does not start the text, ``hasModifier`` is consulted and
      a This/Next/Last operator referring to the month is appended.

    Args:
        s: Element exposing ``getText()`` and ``getSpan()``.
        chrono_id: Counter used to build entity IDs (``"<id>entity"``).
        chrono_list: List that receives new entities (mutated in place); the
            month entity is appended last.
        flags: Dict of "already handled" markers; ``"month"``, ``"day"`` and
            ``"fourdigityear"`` may be set here.
        dct: Accepted for backward compatibility; currently unused (the
            document-time based Next/Last inference was disabled).
        ref_list: Forwarded unchanged to ``hasTextMonth``.

    Returns:
        The tuple ``(chrono_list, chrono_id, flags)`` with ``chrono_id``
        advanced past every entity that was created.
    """
    boo, val, idxstart, idxend = hasTextMonth(s, ref_list)
    if not boo or flags["month"]:
        return chrono_list, chrono_id, flags

    flags["month"] = True
    ref_Sspan, _ = s.getSpan()
    my_month_entity = chrono.chronoMonthOfYearEntity(
        entityID=str(chrono_id) + "entity",
        start_span=ref_Sspan + idxstart,
        end_span=ref_Sspan + idxend,
        month_type=val)
    chrono_id += 1

    full_text = s.getText()

    # idxend is the index just past the month; anything beyond it is
    # trailing text that may hold a day and/or a year.
    if idxend < len(full_text):
        substr = full_text[idxend:].strip(",.").strip()
        num = utils.getNumberFromText(substr)

        if num is not None:
            # The whole remainder is a single number.
            if num <= 31 and not flags["day"]:
                flags["day"] = True
                chrono_id = _appendTextMonthDay(
                    chrono_list, chrono_id, ref_Sspan, full_text, str(num),
                    num)
            elif (1500 <= num <= 2050 and not flags["fourdigityear"]
                  and not flags["loneDigitYear"]):
                flags["fourdigityear"] = True
                chrono_id = _appendTextMonthYear(
                    chrono_list, chrono_id, ref_Sspan, full_text, substr,
                    num, my_month_entity)
        else:
            # Replace punctuation with spaces, then classify each token.
            cleaned = substr.translate(
                str.maketrans(string.punctuation,
                              ' ' * len(string.punctuation)))
            for token in WhitespaceTokenizer().tokenize(cleaned):
                num = utils.getNumberFromText(token)
                if num is None:
                    continue
                if num <= 31:
                    # NOTE(review): unlike the single-number path above, this
                    # path neither checks nor sets flags["day"] -- confirm
                    # whether that is intentional.
                    chrono_id = _appendTextMonthDay(
                        chrono_list, chrono_id, ref_Sspan, full_text, token,
                        num)
                elif (1500 <= num <= 2050 and not flags["fourdigityear"]
                      and not flags["loneDigitYear"]):
                    flags["fourdigityear"] = True
                    chrono_id = _appendTextMonthYear(
                        chrono_list, chrono_id, ref_Sspan, full_text, token,
                        num, my_month_entity)

    # If the month does not start the text there is leading text to inspect
    # for a This/Next/Last modifier.
    if idxstart > 0:
        hasMod, mod_type, mod_start, mod_end = hasModifier(s)
        if hasMod:
            operators = {
                "This": (chrono.ChronoThisOperator, {}),
                "Next": (chrono.ChronoNextOperator, {}),
                "Last": (chrono.ChronoLastOperator,
                         {"semantics": "Interval-Not-Included"}),
            }
            selected = operators.get(mod_type)
            if selected is not None:
                operator_cls, extra = selected
                chrono_list.append(
                    operator_cls(
                        entityID=str(chrono_id) + "entity",
                        start_span=ref_Sspan + mod_start,
                        end_span=ref_Sspan + mod_end,
                        repeating_interval=my_month_entity.get_id(),
                        **extra))
                chrono_id += 1

    chrono_list.append(my_month_entity)
    return chrono_list, chrono_id, flags