def parse_date(self, default_year):
    # Prompt until the input parses and the user confirms the result.
    while True:
        user_input = self.read_from_user(
            'Enter the date (%d %m, %d/%m, also with %y): ')
        parsed_date = DateParser(
            default_year=default_year).reformat(user_input)
        if parsed_date is None:
            # Unparseable input: ask again.
            continue
        if self.is_validated_by_user(parsed_date):
            return parsed_date
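For context, here is a minimal sketch of the DateParser.reformat this loop relies on, inferred from its use here (None on unparseable input, day-month order, optional year filled from default_year) and from the tests in Example #7; the real class may accept more formats and validate ranges.

import re

class DateParser:
    def __init__(self, default_year=None):
        self.default_year = default_year

    def reformat(self, text):
        # Split on spaces, slashes, or commas: "3 9 2020", "3/9/2020", "3 9", ...
        parts = re.split(r'[\s/,]+', text.strip())
        if len(parts) == 2 and self.default_year is not None:
            parts.append(str(self.default_year))  # fall back to the default year
        if len(parts) != 3 or not all(p.isdigit() for p in parts):
            return None  # unparseable: the calling loop prompts again
        day, month, year = (int(p) for p in parts)
        return '%04d-%02d-%02d' % (year, month, day)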
Example #2
    def pre_process(self, tokenizer_type: str, tokenizer_types: List[str]):
        try:
            # read the text from NLTK
            self.read_raw_and_save()

            # get the title and body contents
            title, body = self.split_title_and_body()

            # reformat the body as the text in assignment description
            body = self.reformat_body(body)

            ############################ Pipeline ############################
            # 1. tokenization
            if tokenizer_type == tokenizer_types[0]:
                # use the basic regular expression tokenizer (not enhanced) from NLTK book chapter 3
                pattern = r'''(?x)              # set flag to allow verbose regexps
                        (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
                      | \w+(?:-\w+)*            # words with optional internal hyphens
                      | \$?\d+(?:\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
                      | \.\.\.                  # ellipsis
                      | [][.,;"'?():-_`]        # these are separate tokens; includes ], [
                      '''
            elif tokenizer_type == tokenizer_types[1]:
                # use the enhanced regular expression tokenizer, built on the basic one
                pattern = r'''(?x)                  # set flag to allow verbose regexps
                        (?:[A-Z]\.)+                # abbreviations, e.g. U.S.
                      | \$?\d+(?:,\d+)?(?:\.\d+)?%? # currency or percentages or numbers that include a comma and/or a period e.g. $12.50, 52.25%, 30,000, 3.1415, 1,655.8
                      | \w+(?:-\w+)*                # words with optional internal hyphens e.g. state-of-the-art
                      | \.\.\.                      # ellipsis ...
                      | \'[sS]                      # tokenize "'s" together
                      | [][.,;"'?():-_`]            # these are separate tokens; include ], [
                      | \w+                         # word characters
                      '''
            else:
                raise ValueError(f"Error: tokenizer type '{tokenizer_type}' does not exist in [{', '.join(tokenizer_types)}].")

            regex_tokenizer = RegexpTokenizer(pattern)
            title_tokens = regex_tokenizer.tokenize(title)
            body_tokens = regex_tokenizer.tokenize(body)
            print('Tokenization results:')
            print(title_tokens)
            print(body_tokens)
            print('---------------------------------------------')

            # 2. sentence splitting
            # use NLTK's built-in sentence tokenizer (for clarity)
            body_sents = nltk.sent_tokenize(body)
            print('Sentence splitting results:')
            print(body_sents)
            print('---------------------------------------------')

            # 3. POS tagging
            # one list of (token, tag) pairs per sentence, as returned by nltk.pos_tag
            pos_tags: List[List[Tuple[str, str]]] = []  # needs `Tuple` from typing
            for body_sent in body_sents:
                body_tokens = regex_tokenizer.tokenize(body_sent)
                body_pos_tags = nltk.pos_tag(body_tokens)
                pos_tags.append(body_pos_tags)
            print('Part-of-speech tagging results:')
            print(pos_tags)
            print('---------------------------------------------')

            # 4. number normalization
            # already implemented in `pattern` during the tokenization step

            # measured entity detection
            ud = UnitEntityDetector(pos_tags)
            unit_entity = ud.unit_detection()   # get a list of unit entities
            print('Measured entity detection:')
            print(unit_entity)
            print('---------------------------------------------')

            # 5. date recognition
            dr = DateRecognizer(pos_tags)
            dates = dr.recognize_dates()    # get a list of detected dates
            print('Date recognition:')
            print(dates)
            print('---------------------------------------------')

            # 6. date parsing
            dp = DateParser(body_sents, pos_tags)
            print('Date parsing:')
            dp.date_parse(dates) # parse the dates detected by date recognizer

        except Exception as ex:
            print(ex)  # print(ex.args[0]) fails when args is empty
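As a quick sanity check, the enhanced pattern can be exercised on its own with NLTK's RegexpTokenizer; the sample sentence below is invented.

from nltk.tokenize import RegexpTokenizer

pattern = r'''(?x)
        (?:[A-Z]\.)+
      | \$?\d+(?:,\d+)?(?:\.\d+)?%?
      | \w+(?:-\w+)*
      | \.\.\.
      | \'[sS]
      | [][.,;"'?():-_`]
      | \w+
      '''
tokenizer = RegexpTokenizer(pattern)
print(tokenizer.tokenize(
    "The U.S. firm's state-of-the-art plant cost $1,655.8 million, up 52.25%."))
# ['The', 'U.S.', 'firm', "'s", 'state-of-the-art', 'plant', 'cost',
#  '$1,655.8', 'million', ',', 'up', '52.25%', '.']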
Example #3
    def retrieve(self, parsedQuery):
        '''
        1. Find the relevant paragraphs.
        2. Find the relevant sentences.
        3. Return the answer from the relevant sentences based on the expected answer type.
        '''
        # Find the relevant paragraphs
        relevant_paragraphs = self.getRelevantParagraphs(
            parsedQuery['query_vector'])
        #print(relevant_paragraphs)

        # Retrieve all the sentences.
        all_sentences = []
        for item in relevant_paragraphs:
            if item is not None:
                para = self.data[item[0]]
                all_sentences.extend(sent_tokenize(para))

        # Find the relevant sentences from the sentences.
        if len(all_sentences) == 0:
            return "My apologies! I don't know the answer."

        relevant_sentences = self.getRelevantSentences(all_sentences,
                                                       parsedQuery,
                                                       1)  # Unigram similarity
        #print(relevant_sentences)
        # Now that the relevant sentences are gathered, find the answer based on the expected answer type.
        answer_type = parsedQuery['answer_type']
        # Set the default answer as the first sentence.
        answer = relevant_sentences[0][0]
        # Stemmer object
        stemmer = PorterStemmer()

        # Extract the answer.
        if answer_type == 'DEFINITION':
            # the top-ranked relevant sentence already serves as the answer
            answer = relevant_sentences[0][0]
        # Company, Organization
        elif answer_type == 'ORGANIZATION':
            # Get the named entities.
            named_entities = self.getNamedEntities(
                [item[0] for item in relevant_sentences])
            for entity in named_entities:
                if entity[0] == 'ORGANIZATION':
                    answer = entity[1]
                    answer_tokens = [
                        stemmer.stem(word.lower())
                        for word in word_tokenize(answer.lower())
                    ]
                    question_tokens = [
                        stemmer.stem(word.lower()) for word in word_tokenize(
                            parsedQuery['original_question'].lower())
                    ]
                    if any(ans in question_tokens for ans in answer_tokens):
                        continue
                    break
        # Person
        elif answer_type == 'PERSON':
            named_entities = self.getNamedEntities(
                [item[0] for item in relevant_sentences])
            for entity in named_entities:
                if entity[0] == 'PERSON':
                    answer = entity[1]
                    answer_tokens = [
                        stemmer.stem(word.lower())
                        for word in word_tokenize(answer.lower())
                    ]
                    question_tokens = [
                        stemmer.stem(word.lower()) for word in word_tokenize(
                            parsedQuery['original_question'].lower())
                    ]
                    if any(ans in question_tokens for ans in answer_tokens):
                        continue
                    break
        # Location
        elif answer_type == 'LOCATION':
            named_entities = self.getNamedEntities(
                [item[0] for item in relevant_sentences])
            for entity in named_entities:
                if entity[0] == 'GPE':
                    answer = entity[1]
                    answer_tokens = [
                        stemmer.stem(word.lower())
                        for word in word_tokenize(answer.lower())
                    ]
                    question_tokens = [
                        stemmer.stem(word.lower()) for word in word_tokenize(
                            parsedQuery['original_question'].lower())
                    ]
                    if any(ans in question_tokens for ans in answer_tokens):
                        continue
                    break
        # Date
        elif answer_type == 'DATE':
            dp = DateParser()
            dates = []
            for sentence in relevant_sentences:
                dates.extend(dp.extractDate(sentence[0]))
            if len(dates) > 0:
                answer = dates[0]
        # Other types
        elif answer_type in ['NN', 'NNP']:
            named_entities = self.getChunks(
                [item[0] for item in relevant_sentences])
            for entity in named_entities:
                if answer_type == 'NN':
                    if entity[0] == 'NN' or entity[0] == 'NNS':
                        answer = entity[1]
                        answer_tokens = [
                            stemmer.stem(word.lower())
                            for word in word_tokenize(answer.lower())
                        ]
                        question_tokens = [
                            stemmer.stem(word.lower())
                            for word in word_tokenize(
                                parsedQuery['original_question'].lower())
                        ]
                        if any(ans in question_tokens for ans in answer_tokens):
                            continue
                        break

                elif answer_type == 'NNP':
                    if entity[0] == 'NNP' or entity[0] == 'NNPS':
                        answer = entity[1]
                        answer_tokens = [
                            stemmer.stem(word.lower())
                            for word in word_tokenize(answer.lower())
                        ]
                        question_tokens = [
                            stemmer.stem(word.lower())
                            for word in word_tokenize(
                                parsedQuery['original_question'].lower())
                        ]
                        if any(ans in question_tokens for ans in answer_tokens):
                            continue
                        break

        return answer
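The five branches above repeat the same "does the candidate answer merely echo the question?" test. Here is a sketch of that check as a single helper, using NLTK's PorterStemmer and word_tokenize (the name echoes_question is hypothetical):

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def echoes_question(answer, question, stemmer=PorterStemmer()):
    # True when any stemmed answer token already appears in the stemmed question.
    answer_tokens = {stemmer.stem(w) for w in word_tokenize(answer.lower())}
    question_tokens = {stemmer.stem(w) for w in word_tokenize(question.lower())}
    return bool(answer_tokens & question_tokens)

Each entity branch could then shrink to: if echoes_question(entity[1], parsedQuery['original_question']): continue.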
Example #4
  def setUp(self):
    self.date = DateParser()
Example #5
class DateParserTest(unittest.TestCase):

  def setUp(self):
    self.date = DateParser()

  # The input file consists of a single line containing three integers
  # separated by "/". There are no extra spaces around the "/" ...

  def test_stringWith3Integers(self):

    dateString = "12/17/85"
    self.assertEqual(self.date.returnIntegers(dateString), [12, 17, 85])

  def test_badStrings(self):
    dateStrings = ["sdklafj/12/97", "12/23/23/19", "12 /37/99"]

    for dateString in dateStrings:
      with self.assertRaises(DateParserException):
        self.date.returnIntegers(dateString)

  # ...between 0 and 2999, ... 

  def test_dateInRange(self):
    dates = [[12, 34, 2011], [23, 67, 199], [1, 1, 2]]

    for date in dates:
      self.assertTrue(self.date.dateInRange(date))

  def test_dateNotInRange(self):
    dates = [[12, 34, 3111], [-3, 67, 199], [1, 1000000000, 2]]

    for date in dates:
      self.assertFalse(self.date.dateInRange(date))

  # At most one of the integers has four digits, and the others have one or two
  # digits.
  
  def test_validSizes(self):
    dates = [["3", "32", "8887"], ["31", "3486", "98"], ["98", "1", "2"]]

    for date in dates:
      self.assertTrue(self.date.atMostOneHave4Digits(date))
  
  def test_invalidSizes(self):
    dates = [["3", "832", "8887"], ["831", "3486", "98"], ["88898", "1", "2"], ["8898", "1", ""]]

    for date in dates:
      self.assertFalse(self.date.atMostOneHave4Digits(date))
  
  def test_atMostOneHave4Digits(self):
    dates = [["23", "32", "87"], ["31", "3486", "98"]]

    for date in dates:
      self.assertTrue(self.date.atMostOneHave4Digits(date))

  def test_moreThanOneHave4Digits(self):
    dates = [["23", "2232", "2287"], ["1131", "3486", "98"], ["1131", "3486", "8998"]]

    for date in dates:
      self.assertFalse(self.date.atMostOneHave4Digits(date))

  def test_getEarlyDate(self):
    dates = ["02/4/67", "31/9/73", "2014/2/29"]
    results = ["2067-02-04", "31/9/73 is illegal", "2014/2/29 is illegal"]

    for date, result in zip(dates, results):  # izip is Python 2 only; zip works here
      self.assertEqual(self.date.parseDate(date), result)
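The tests above pin down three of the helpers. Here is a sketch of a DateParser consistent with them (parseDate is omitted because the earliest-legal-date logic is not fully specified by these tests; DateParserException is assumed to be a plain Exception subclass):

import re

class DateParserException(Exception):
    pass

class DateParser:
    def returnIntegers(self, dateString):
        # Exactly three integer fields separated by "/", with no extra spaces.
        parts = dateString.split("/")
        if len(parts) != 3 or not all(re.fullmatch(r"\d+", p) for p in parts):
            raise DateParserException("malformed date string: %r" % dateString)
        return [int(p) for p in parts]

    def dateInRange(self, date):
        # Every field must lie between 0 and 2999 (inclusive).
        return all(0 <= n <= 2999 for n in date)

    def atMostOneHave4Digits(self, fields):
        # At most one field has four digits; the others must have one or two.
        lengths = [len(f) for f in fields]
        return lengths.count(4) <= 1 and all(n in (1, 2, 4) for n in lengths)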
Example #6
class Evaluator(object):

    # time in minutes after which an edit on the item page is no longer counted when calculating how many changes occurred after a view on an edit page
    intervall = 10

    def __init__(self, log):
        super(Evaluator, self).__init__()
        self.log_file_path = log
        self.date_parser = DateParser()

    # writes to a csv file every change that occurred during a session;
    # the script takes the info it needs from the log file passed to its constructor

    # might also write another file (or the same file) stating whether a change occurred on the item's page during an interval (see self.intervall)

    def _summarize_summary(self, result):
        summary = {'violation': 0, 'compliance': 0, 'exception': 0}
        for res in result:
            summary['violation'] += result[res]['violation']
            summary['compliance'] += result[res]['compliance']
            summary['exception'] += result[res]['exception']

        return summary

    def _find_index_of_latest_visit_of_session(self, lines, special_page,
                                               entity_id, start_time,
                                               start_index):

        # TODO: return latest index...
        return 0

    def _delete_unneeded_entries_for_session(self, lines, special_page,
                                             entity_id, i, end_index):
        # TODO: delete...
        return lines

    def run(self):
        csv_file = open(
            "csv/evaluation" +
            datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".csv",
            "w", newline="")  # csv.writer needs text mode with newline="" in Python 3
        csv_writer = csv.writer(csv_file)
        lines = [line.strip() for line in open(self.log_file_path)]

        i = 0
        while i < len(lines):
            if lines[i].find("unittest") != -1:
                i += 1  # skip unittest entries without stalling the loop
                continue
            log_entry = json.loads(lines[i][lines[i].find("{"):])
            special_page = log_entry["special_page_id"]
            entity_id = log_entry["entity_id"]
            start = self.date_parser.get_date_from_timestamp(
                log_entry["insertion_timestamp"])
            end_index = self._find_index_of_latest_visit_of_session(
                lines, special_page, entity_id, start, i)
            lines = self._delete_unneeded_entries_for_session(
                lines, special_page, entity_id, i, end_index)
            result_summary = self._summarize_summary(
                log_entry["result_summary"])
            csv_writer.writerow(
                (special_page, entity_id, start, result_summary['violation'],
                 result_summary['compliance'], result_summary['exception']))
            i += 1  # advance to the next log entry; otherwise the loop never terminates

        # repeat until list is empty
        # 	take first entry, memorize result, SPid and entityId and search last visit entry that belongs to session
        # 	delete every entry with this SPid and entityId until last, search all belonging job entries, memorize results, delete them and every job entry before
        # 	build Session object, write this entry to csv file

        csv_file.close()
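A tiny usage sketch of _summarize_summary: it folds the per-check counters into one total (the input dict below is invented).

result = {
    'label_check': {'violation': 1, 'compliance': 4, 'exception': 0},
    'format_check': {'violation': 0, 'compliance': 2, 'exception': 1},
}
# Evaluator('run.log')._summarize_summary(result)
# -> {'violation': 1, 'compliance': 6, 'exception': 1}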
Example #7
    def __init__(self, log):
        super(Evaluator, self).__init__()
        self.log_file_path = log
        self.date_parser = DateParser()

    def test_parse_complete_date_separated_by_spaces(self):
        self.assertEqual(DateParser().reformat("3 9 2020"), "2020-09-03")

    def test_parse_incomplete_date_separated_by_spaces(self):
        self.assertEqual(DateParser(default_year="2020").reformat("3 9"), "2020-09-03")
Example #10
        )  # Builds the objects and returns the list of RDO objects
        SexObject.append(2)
        DataList.append(SexObject)
        Counter += 1

    elif DataType == 3:
        print('Picked 3')
        DiseaseObject = DiseaseParser(rows, ID_Table).getRDO()  # Builds the objects and returns the list of RDO objects
        DiseaseObject.append(3)
        DataList.append(DiseaseObject)
        Counter += 1

    elif DataType == 4:
        print('Picked 4')
        DateObject = DateParser(rows, ID_Table).getRDO()  # Builds the objects and returns the list of RDO objects
        DateObject.append(4)
        DataList.append(DateObject)
        Counter += 1

    elif DataType == 5:
        print('Picked 5')
        RateObject = incidenceMortalityPrevalenceRate(rows, ID_Table).getRDO()  # Builds the objects and returns the list of RDO objects
        RateObject.append(6)  # yeah, I know, picked 5... but id 5 is reserved for ID
        DataList.append(RateObject)
        Counter += 1

    else:
        print('I see you want the hidden option... too bad it does not exist!')
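The elif chain above could also be table-driven. A hypothetical refactor sketch reusing the parser classes from the snippet (PARSERS and rdo_id are invented names; the truncated branches above would slot in the same way):

# Hypothetical dispatch table: DataType -> (parser class, RDO type id).
PARSERS = {
    3: (DiseaseParser, 3),
    4: (DateParser, 4),
    5: (incidenceMortalityPrevalenceRate, 6),  # DataType 5 deliberately maps to id 6
}

entry = PARSERS.get(DataType)
if entry is None:
    print('I see you want the hidden option... too bad it does not exist!')
else:
    parser_cls, rdo_id = entry
    rdo = parser_cls(rows, ID_Table).getRDO()  # builds and returns the list of RDO objects
    rdo.append(rdo_id)
    DataList.append(rdo)
    Counter += 1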