示例#1
0
def test_parser_private_warns():
    """Check that the private names imported from ``dateutil.parser`` warn.

    ``_tzparser``, ``_timelex`` and ``_parsetz`` are each called once inside
    ``pytest.warns(DeprecationWarning)``, so the test fails if any of the
    calls does not emit a ``DeprecationWarning``.
    """
    from dateutil.parser import _timelex, _tzparser, _parsetz

    # (callable, positional arguments) pairs, exercised in this order.
    invocations = (
        (_tzparser, ()),
        (_timelex, ('2014-03-03',)),
        (_parsetz, ('+05:00',)),
    )

    for private_callable, call_args in invocations:
        with pytest.warns(DeprecationWarning):
            private_callable(*call_args)
示例#2
0
def test_parser_private_warns():
    """Check that the private names imported from ``dateutil.parser`` warn.

    ``_tzparser``, ``_timelex`` and ``_parsetz`` are each called once inside
    ``pytest.warns(DeprecationWarning)``, so the test fails if any of the
    calls does not emit a ``DeprecationWarning``.
    """
    from dateutil.parser import _timelex, _tzparser, _parsetz

    # (callable, positional arguments) pairs, exercised in this order.
    invocations = (
        (_tzparser, ()),
        (_timelex, ('2014-03-03',)),
        (_parsetz, ('+05:00',)),
    )

    for private_callable, call_args in invocations:
        with pytest.warns(DeprecationWarning):
            private_callable(*call_args)
示例#3
0
def test_parser_parser_private_not_warns():
    """Check that the names imported from ``dateutil.parser._parser`` stay quiet.

    Each call runs under ``warnings.simplefilter("error")``, so any warning
    raised by ``_tzparser``, ``_timelex`` or ``_parsetz`` becomes an exception
    and fails the test.
    """
    from dateutil.parser._parser import _timelex, _tzparser, _parsetz

    # (callable, positional arguments) pairs, exercised in this order.
    invocations = (
        (_tzparser, ()),
        (_timelex, ('2014-03-03',)),
        (_parsetz, ('+05:00',)),
    )

    for private_callable, call_args in invocations:
        # A fresh catch_warnings context per call restores the filters afterwards.
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            private_callable(*call_args)
示例#4
0
def test_parser_parser_private_not_warns():
    """Check that the names imported from ``dateutil.parser._parser`` stay quiet.

    ``_tzparser``, ``_timelex`` and ``_parsetz`` are each called with warnings
    escalated to errors, so any warning emitted by a call fails the test.
    """
    # Imported locally so this block does not depend on a module-level import.
    import warnings

    from dateutil.parser._parser import _timelex, _tzparser
    from dateutil.parser._parser import _parsetz

    # ``pytest.warns(None)`` was deprecated in pytest 7 and removed in
    # pytest 8; turning warnings into errors is the documented replacement
    # for asserting that no warning is raised.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        _tzparser()

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        _timelex('2014-03-03')

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        _parsetz('+05:00')
示例#5
0
    def weekday_reader(self, query):

        """
        Replace weekday names in the query with concrete dates.

        Every whole-word weekday name or three-letter abbreviation (matched
        case-insensitively) is replaced by a date formatted as "%d.%B.%Y",
        computed relative to today.
        Params:
            Input:
                query - str
            Output:
                query - str
        """

        wkday = r'\b(monday|mon|tuesday|tue|wednesday|wed|thursday|thu|friday|fri|saturday|sat|sunday|sun)\b'
        check_day = re.search(wkday, query, re.I)
        while check_day:
            start = datetime.datetime.today()
            this_day = start.weekday()
            split_q = [x for x in _timelex(query.lower()) if x != ' ']
            day_word = check_day.group(0)
            # presumably self.info is a dateutil parserinfo mapping the name
            # to a 0-6 weekday index - verify against the class constructor
            that_day = self.info.weekday(day_word)
            if that_day >= this_day:
                # The tokens were produced from the lower-cased query, so the
                # matched word must be lower-cased before it is looked up.
                try:
                    pos = split_q.index(day_word.lower())
                except ValueError:
                    pos = -1
                # pos > 0 keeps a weekday in first position from wrapping
                # around to the last token when checking for "next".
                if pos > 0 and split_q[pos - 1] == 'next':
                    diff = that_day - this_day + 7
                    diff = diff if diff < 7 + (6 - this_day) else diff - 7
                else:
                    diff = that_day - this_day
            else:
                # The weekday already passed this week: use next week's.
                diff = (6 - this_day) + (that_day + 1)
            repl = (start + relativedelta(days=diff))
            # Splice by match position so that only the matched word is
            # replaced, never an earlier substring such as "mon" in "month".
            query = (query[:check_day.start()] + repl.strftime("%d.%B.%Y")
                     + query[check_day.end():])
            check_day = re.search(wkday, query, re.I)
        return query
示例#6
0
def test_parser_parser_private_not_warns():
    """Check that the names imported from ``dateutil.parser._parser`` stay quiet.

    ``_tzparser``, ``_timelex`` and ``_parsetz`` are each called with warnings
    escalated to errors, so any warning emitted by a call fails the test.
    """
    # Imported locally so this block does not depend on a module-level import.
    import warnings

    from dateutil.parser._parser import _timelex, _tzparser
    from dateutil.parser._parser import _parsetz

    # ``pytest.warns(None)`` was deprecated in pytest 7 and removed in
    # pytest 8; turning warnings into errors is the documented replacement
    # for asserting that no warning is raised.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        _tzparser()

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        _timelex('2014-03-03')

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        _parsetz('+05:00')
示例#7
0
 def word2nummain(self, query):
     """
     Split the query on a fixed set of separator words and symbols, convert
     the number words in each piece and return the re-joined result.
     Any exception is printed together with its traceback; in that case
     nothing is returned (the caller receives None).
     Params:
         Input:
             query - str
         Output:
             output_query - str
     """
     try:
         cleaned = self.preprocess(query)
         splitter = regex.compile(
             r"""\b(month|year|january|jan|february|feb|march|mar|april|apr|may|
                           june|jun|july|jul|august|aug|september|sept|sep|october|oct|november|
                           nov|december|dec|for|to|th|nd|st|rd|of|
                           (?<!hundred\s|thousand\s)and)\b|(\.|:)""")
         # The literal is laid out over several lines; drop the layout
         # whitespace to obtain the pattern that is actually applied.
         split_pattern = splitter.pattern.replace('\n', '').replace(' ', '')
         # The pattern has capture groups, so the split result can contain
         # None for a group that did not take part in a match.
         pieces = regex.split(split_pattern, cleaned, flags=re.I)
         sent_list = [piece for piece in pieces if isinstance(piece, str)]
         converted = " ".join(self.convertword2num(sent_list))
         tokens = (tok for tok in _timelex(converted) if tok != ' ')
         return ' '.join(list(tokens))
     except Exception as exc:
         print("the error in main is>>>", traceback.format_exc(), exc)
示例#8
0
def timesplit(input_string):
    """Yield space-joined runs of consecutive time tokens from *input_string*.

    Tokens come from ``_timelex``. A token rejected by ``timetoken`` ends the
    current run; a token accepted by ``timetoken`` but flagged by
    ``info.jump`` is dropped without ending the run.
    """
    pending = []
    for tok in _timelex(input_string):
        if not timetoken(tok):
            # A non-time token closes the current run, if there is one.
            if pending:
                yield " ".join(pending)
                pending = []
        elif not info.jump(tok):
            pending.append(tok)
    if pending:
        yield " ".join(pending)
示例#9
0
def timesplit(input_string):
    """Yield space-joined runs of consecutive time tokens from *input_string*.

    Tokens come from ``_timelex``. A token rejected by ``timetoken`` ends the
    current run; a token accepted by ``timetoken`` but flagged by
    ``info.jump`` is dropped without ending the run.
    """
    pending = []
    for tok in _timelex(input_string):
        if not timetoken(tok):
            # A non-time token closes the current run, if there is one.
            if pending:
                yield " ".join(pending)
                pending = []
        elif not info.jump(tok):
            pending.append(tok)
    if pending:
        yield " ".join(pending)
def timesplit(input_string):
    """Helper method used by __extract_dates.

    Yield space-joined runs of consecutive time tokens from *input_string*.
    The words 'to' and 'and' always end the current run, as does any token
    rejected by ``timetoken``. Tokens flagged by the parser info's ``jump``
    are dropped without ending the run.
    """
    pending = []
    for tok in _timelex(input_string):
        if tok in ['to', 'and']:
            # Separators flush unconditionally: when nothing is pending an
            # empty string is yielded.
            yield " ".join(pending)
            pending = []
        elif not timetoken(tok):
            if pending:
                yield " ".join(pending)
                pending = []
        elif not dparser.parser().info.jump(tok):
            pending.append(tok)
    if pending:
        yield " ".join(pending)
示例#11
0
    def preprocess(self, query, delta, dayfirst=True, monthfirst=False, yearfirst=False):

        """
        Perform basic preprocessing on the input query.
        It normalises day/month ranges and ordinals, and replaces relative
        expressions with concrete values, e.g.
                    'this month' with the current month name
                    'next year' with the next year
                    'yesterday' with the previous day's date
                    'day after/day after tomorrow' with 2 days from today's date
        Dates are written in the "%d.%B.%Y" format.
        Params:
            Input:
                query - str
                delta - str (returned unchanged by this method)
                dayfirst - bool(true by default)[optional]
                monthfirst - bool(false by default)[optional]
                yearfirst - bool(false by default)[optional]
            Output:
                query - str
                delta - str
        """

        # Separate an ordinal suffix glued to a month name ("thjanuary").
        ptn = re.compile("(th|rd|st|nd)"
                         "(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|"
                         "august|aug|september|sept|sep|october|oct|november|nov|december|dec)")
        ptn = ptn.pattern.replace('\n', '').replace(' ', '')
        srch = re.search(ptn, query.lower())
        while srch:
            query = re.sub(srch.group(0), srch.group(1)+" "+srch.group(2), query, flags=re.I)
            # NOTE(review): this re-search is case-sensitive while the first
            # one runs on the lower-cased query - confirm that is intended.
            srch = re.search(ptn, query)
        query = re.sub(r'\b(a day(s)?|a night(s)?)\b', '1 day', query, flags=re.I)
        # The plural "s" is optional, as in the sibling patterns; without the
        # "?" the phrase "a week" was never normalised.
        query = re.sub(r'\b(a week(s)?)\b', '1 week', query, flags=re.I)
        query = re.sub(r'\b(a month(s)?)\b', '1 month', query, flags=re.I)
        query = re.sub(r'\b(a year(s)?)\b', '1 year', query, flags=re.I)

        # Re-tokenise so tokens are separated by exactly one space, and pad
        # both ends so the patterns below can anchor on surrounding spaces.
        query = list(_timelex(query))
        query = [each for each in query if each != ' ']
        query = ' '.join(query)
        query = ' '+query+' '

        query = self.fixed_delta_search(query)

        # The multi-line patterns below are laid out for readability; the
        # newlines and spaces are stripped before the pattern is applied.

        # "<d1> [th] to <d2> [th] [of] <month>"  ->  "<d1> th - <d2> th <month>"
        reg1 = regex.compile(r"""(?<!(january|jan|february|feb|march|mar|april|apr|may|
                             june|jun|july|jul|august|aug|september|sept|sep|october|
                             oct|november|nov|december|dec))(\s)
                             (\d+)(\s)
                             (\b(th|st|rd|nd)\b(\s))?
                             (-|\bto\b|\band\b|&)(\s)
                             (\d+)(\s)
                             (\b(th|st|rd|nd)\b(\s))?
                             (\bof\s\b)?
                             \b(january|jan|february|feb|march|mar|april|apr|may|june|
                             jun|july|jul|august|aug|september|sept|sep|october|oct|
                             november|nov|december|dec)\b""")
        reg1 = reg1.pattern.replace('\n', '').replace(' ', '')
        query = regex.sub(reg1, r" \3 th - \10 th \16 ", query, flags=re.I)

        # "<month> <d1> [th] to <d2> [th]"  ->  "<d1> th - <d2> th <month>"
        reg2 = re.compile(r"""(?<!\d)(\s)
                          \b(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|
                          august|aug|september|sept|sep|october|oct|november|nov|december|dec)\b
                          (\s)(\d+)(\s)
                          (\b(th|st|rd|nd)\b(\s))?
                          (\bto\b|\band\b|-|&)(\s)
                          (\d+)(\s)
                          (\b(th|st|rd|nd)\b(\s))?
                          (?!(\.|:|\\|\/|-))""")
        reg2 = reg2.pattern.replace('\n', '').replace(' ', '')
        query = re.sub(reg2, r" \4 th - \11 th \2 ", query, flags=re.I)

        # "<d> [of] <month>"  ->  "<d> th <month>"
        reg3 = re.compile(r"""(?<!(\d\s\.|\d\s:|\d\s\\|\d\s\/|\d\s-))(\s)
                          (\d+)(\s)(\bof\s\b)?\b(january|jan|february|feb|
                          march|mar|april|apr|may|june|jun|july|jul|august|aug|september|
                          sept|sep|october|oct|november|nov|december|dec)\b""")
        reg3 = reg3.pattern.replace('\n', '').replace(' ', '')
        query = re.sub(reg3, r" \3 th \6 ", query, flags=re.I)

        # "<month> <d>"  ->  "<month> <d> th"
        reg4 = regex.compile(r"""(?<!(\d\s|\bst\b\s|\bth\b\s|\brd\b\s|\bnd\b\s|\bof\b\s))
                             \b(january|jan|february|feb|march|mar|april|apr|may|june|jun|
                             july|jul|august|aug|september|sept|sep|october|oct|november|nov|
                             december|dec)\b(\s)(\d+)(\s)""")
        reg4 = reg4.pattern.replace('\n', '').replace(' ', '')
        query = regex.sub(reg4, r" \2 \4 th ", query, flags=re.I)

        query = self.date_format_reader(query, dayfirst, monthfirst, yearfirst)

        # "<d1> th <m1> to <d2> th <m2> <year>": repeat the trailing number
        # (group 15) after the first date as well.
        reg5 = re.compile(r"""(\d+)(\s+)?\b(nd|st|rd|th)\b(\s+)?
                          \b(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|
                          august|aug|september|sept|sep|october|oct|november|nov|december|dec|
                          of next month|next month|of this month|this month)\b
                          (\s+)?(\bto\b|\band\b|-)(\s+)?
                          (\d+)(\s+)?\b(nd|st|rd|th)\b(\s+)?
                          \b(january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|
                          august|aug|september|sept|sep|october|oct|november|nov|december|dec|
                          of next month|next month|of this month|this month)\b(\s+)?(\d+)""")
        reg5 = reg5.pattern.replace('\n', '').replace(' ', '')
        query = re.sub(reg5, r"\1 \3 \5 \15 \7 \9 \11 \13 \15", query, flags=re.I)

        query = self.weekday_reader(query)

        this_year = datetime.datetime.now().year
        query = re.sub(r'(of this year|this year)', 'year ' + str(this_year), query, flags=re.I)

        next_year = datetime.datetime.now().year + 1
        query = re.sub(r'(of next year|next year)', 'year ' + str(next_year), query, flags=re.I)

        last_year = datetime.datetime.now().year - 1
        query = re.sub(r'(of (l|p)ast year|(l|p)ast year|of prev year|of previous year|prev year|previous year)',
                       'year ' + str(last_year), query, flags=re.I)
        print(query)

        this_month = datetime.datetime.now().strftime("%B")
        query = re.sub(r'(of this month|this month)', this_month, query, flags=re.I)

        next_month = (datetime.date.today() + relativedelta(months=1)).strftime("%B")
        query = re.sub(r'(of next month|next month)', next_month, query, flags=re.I)

        last_month = (datetime.date.today() - relativedelta(months=1)).strftime("%B")
        query = re.sub(r'(of last month|last month|of previous month|of prev month|previous month|prev month)',
                       last_month, query, flags=re.I)

        # "tomorrow" is skipped when preceded by "after " so that
        # "day after tomorrow" is left for the dedicated step further down.
        temporal = [r'now', r'today', r'tonight', r'(?<!after\s)tomorrow', r'yesterday']
        days = re.findall(r'\b' + r'\b|\b'.join(temporal) + r'\b', query)
        if days:
            for day in days:
                if day == 'today' or day == 'now' or day == 'tonight':
                    query = query.replace(day, datetime.datetime.now().date().strftime("%d.%B.%Y"))
                if day == 'tomorrow':
                    query = re.sub(r'(?<!after\s)tomorrow', (datetime.date.today()
                                                             + relativedelta(days=1)).strftime("%d.%B.%Y"), query, flags=re.I)
                if day == 'yesterday':
                    query = query.replace(day, (datetime.date.today()
                                                - relativedelta(days=1)).strftime("%d.%B.%Y"))

        # Only the first "day after [tomorrow]" occurrence is replaced.
        next_day = re.search(r'day after tomorrow|day after', query, re.I)
        if next_day:
            query = query[:next_day.span()[0]] + ((datetime.datetime.now()
                                                   + relativedelta(days=2)).date()).strftime("%d.%B.%Y") + query[next_day.span()[1]:]

        return (query, delta)
示例#12
0
    def timesplit(self, query):

        """
        Split the query into tokens and filter out irrelevant words.
        Only month names and numbers that signify a date or a year are kept;
        they are grouped into one list per date expression. Two-digit years
        are expanded to four digits, both in the groups and in the query.
        Params:
            Input:
                query - str
            Output:
                split_query - list of lists
                query - str
        """

        split_query = list(_timelex(query))
        split_query = [x for x in split_query if x != ' ']
        year_jump = ['of', 'in', 'year', 'years']
        date_jump = ['to', 'of', '-']
        date_check = ['th', 'rd', 'st', 'nd', '.']
        # date_check without the dot: the ordinal suffixes only
        ordinal_suffixes = [sfx for sfx in date_check if sfx != '.']
        llist = []
        print(split_query)
        now = datetime.datetime.now().year
        this_century = now // 100 * 100

        def _century_for(two_digit):
            # Choose the current or the previous century, whichever puts the
            # two-digit year closer to the current year.
            future = two_digit + this_century
            past = future - 100
            if (now - past) < (future - now):
                return this_century - 100
            return this_century

        if len(split_query) > 1:
            for ind, each in enumerate(split_query):
                if ind == 0:
                    if (split_query[ind].isdigit() and \
                        split_query[ind+1] in date_check) or \
                    self.info.month(split_query[ind]):
                        llist.append(split_query[ind])
                if ind == len(split_query) - 1:
                    if self.info.month(split_query[ind]):
                        llist.append(split_query[ind])
                    if split_query[ind].isdigit():
                        # This is the last token, so there is no following
                        # "month(s)" word to rule out after a year_jump word;
                        # looking at ind + 1 here raised IndexError.
                        if split_query[ind - 1] in date_check + date_jump or \
                        self.info.month(split_query[ind - 1]) or \
                        split_query[ind - 1] in year_jump or \
                        int(split_query[ind]) > 1000:
                            if int(split_query[ind]) > 1000:
                                llist.append(split_query[ind])
                            elif int(split_query[ind]) < 100:
                                rep = split_query[ind]
                                century = _century_for(int(rep))
                                split_query[ind] = str(int(rep)+century)
                                query = query.replace(' '+rep+' ', ' '+split_query[ind]+' ')
                                # Expand every other occurrence of the same
                                # two-digit token as well.
                                split_query = [str(int(x)+century) if x == rep else x for x in split_query]
                                llist.append(split_query[ind])
                if ind > 0 and ind < len(split_query) - 1:
                    if self.info.month(split_query[ind]):
                        llist.append(split_query[ind])
                    elif split_query[ind].isdigit():
                        if split_query[ind + 1] in date_check and int(split_query[ind]) < 32:
                            llist.append(split_query[ind])
                        elif split_query[ind - 1] in date_check + date_jump or \
                        self.info.month(split_query[ind - 1]) or \
                        (split_query[ind - 1] in year_jump and \
                        split_query[ind + 1] not in ["month", "months"]) or \
                        int(split_query[ind]) > 1000:
                            if int(split_query[ind]) > 1000:
                                llist.append(split_query[ind])
                            elif int(split_query[ind]) < 100:
                                rep = split_query[ind]
                                century = _century_for(int(rep))
                                split_query[ind] = str(int(rep)+century)
                                query = query.replace(' '+rep+' ', ' '+split_query[ind]+' ')
                                split_query = [str(int(x)+century) if x == rep else x for x in split_query]
                                llist.append(split_query[ind])
                    elif each not in year_jump + ordinal_suffixes and not self.info.month(each) and \
                    (split_query[ind - 1].isdigit() or self.info.month(split_query[ind - 1]) or \
                     split_query[ind - 1] in ordinal_suffixes + date_jump):
                        # '_' marks a boundary between two date expressions.
                        if each in ['and', 'to', '-']:
                            if split_query[ind - 1] not in ordinal_suffixes and \
                            not split_query[ind + 1].isdigit():
                                llist.append('_')
                        elif each == '.':
                            if not (split_query[ind - 1].isdigit() and \
                                    self.info.month(split_query[ind + 1])) and \
                            not (self.info.month(split_query[ind - 1]) and \
                                 split_query[ind + 1].isdigit()):
                                llist.append('_')
                        else:
                            llist.append('_')
        elif split_query:
            # Single token: keep it only if it is a month name. An empty
            # token list (empty query) falls through with nothing collected.
            if self.info.month(split_query[0]):
                llist.append(split_query[0])
        print(llist)
        # Cut the flat list into groups at every '_' boundary marker.
        split_query = []
        small = []
        for ind, each in enumerate(llist):
            if each == '_' or ind == len(llist)-1:
                if each != '_':
                    small.append(each)
                split_query.append(small)
                small = []
            else:
                small.append(each)
        split_query = [each for each in split_query if each != []]
        print(">>>>>>>",split_query)
        return split_query, query
示例#13
0
    def convertword2num(self, sent_list):
        """
        This is the main function where numbers are extracted from
        their alphabetic equivalents.
        Each sentence is tokenised, the number words it contains are
        collected, and the collected phrase is replaced in the sentence by
        the value returned by ``number``.
        Params:
            Input:
                sent_list - list of broken sentences
            Output:
                new_list - same list of sentences, but with numbers
        """
        new_list = []
        print(sent_list)
        for sent in sent_list:
            # True while the tokens being scanned belong to one number phrase
            is_a_part = False
            words = []
            temp_sent = [x for x in list(_timelex(sent)) if x != ' ']
            for ind, wrd in enumerate(temp_sent):
                if wrd in self.num_list + self.sim_list:
                    if not is_a_part:
                        is_a_part = True
                        if wrd in self.sim_list:
                            # A sim_list word with no number in front of it
                            # gets an implicit leading "one".
                            if ind == 0:
                                sent = sent.replace(wrd, u'one ' + wrd)
                                wrd = u'one ' + wrd
                                words.append(wrd)
                            elif number(temp_sent[ind - 1]) == 0:
                                sent = sent.replace(wrd, u'one ' + wrd)
                                wrd = u'one ' + wrd
                                words.append(wrd)
                        # A value of 1-20 directly followed by a number word
                        # whose value is above 9 gets an implicit "hundred".
                        if ind != len(temp_sent) - 1 and \
                        number(temp_sent[ind]) in range(1, 21) and \
                        temp_sent[ind+1] in self.num_list and \
                        number(temp_sent[ind+1]) > 9:
                            sent = sent.replace(wrd, wrd + u' hundred')
                            wrd = wrd + u' hundred'
                            words.append(wrd)
                        elif wrd not in words:
                            words.append(wrd)
                    else:
                        if ind != len(temp_sent)-1 and \
                        number(temp_sent[ind]) in range(1, 21) and \
                        temp_sent[ind+1] in self.num_list and \
                        number(temp_sent[ind+1]) > 9:
                            sent = sent.replace(wrd, wrd + u' hundred')
                            wrd = wrd + u' hundred'
                            words.append(wrd)
                        else:
                            words.append(wrd)
                elif wrd == 'and' and temp_sent[ind - 1] in self.sim_list:
                    if is_a_part:
                        # Keep the "and" only when a number word follows. The
                        # explicit bounds check replaces a bare except that
                        # hid the IndexError raised on a trailing "and".
                        if ind + 1 < len(temp_sent) and \
                        temp_sent[ind + 1] in self.num_list:
                            words.append(wrd)
                else:
                    is_a_part = False

            word = " ".join(words)
            word = list(_timelex(word))
            word = ' '.join([x for x in word if x != ' '])
            try:
                num = number(word)
                print(num)
            except Exception:
                # The collected words do not form a number: leave the
                # sentence untouched.
                pass
            else:
                if word:
                    sent = sent.replace(word, str(num))
            new_list.append(sent)

        return new_list
示例#14
0
def build_test(i, test_string):
    python_tokens = list(_timelex(test_string))
    formatted_tokens = 'vec!["' + '", "'.join(python_tokens) + '"]'
    return f'''