def parse_date(self, default_year):
    """Prompt the user for a date until one parses and is confirmed.

    Keeps asking for input; each answer is run through DateParser
    (with *default_year* filling in a missing year) and the normalized
    date is returned once the user validates it.
    """
    while True:
        answer = self.read_from_user(
            'Enter the date (%d %m, %d/%m, also with %y): ')
        normalized = DateParser(default_year=default_year).reformat(answer)
        # Unparseable input -> ask again; parseable but rejected -> ask again.
        if normalized is not None and self.is_validated_by_user(normalized):
            return normalized
def pre_process(self, tokenizer_type: str, tokenizer_types: List[str]):
    """Run the full NLP pre-processing pipeline on the loaded corpus.

    Steps: tokenization (with the regex tokenizer selected by
    *tokenizer_type*), sentence splitting, POS tagging, measured-entity
    detection, date recognition and date parsing.  Intermediate results
    are printed to stdout.

    tokenizer_type  -- which tokenizer to use; must be an element of
                       *tokenizer_types*.
    tokenizer_types -- the list of known tokenizer names; index 0 is the
                       basic regex tokenizer, index 1 the enhanced one.

    Any exception raised by the pipeline is caught and printed.
    """
    try:
        # read the text from NLTK
        self.read_raw_and_save()
        # get the title and body contents
        title, body = self.split_title_and_body()
        # reformat the body as the text in assignment description
        body = self.reformat_body(body)
        ############################ Pipeline ############################
        # 1. tokenization
        if tokenizer_type == tokenizer_types[0]:
            # use basic regular expression tokenzier (not enhanced) from NLTK book chapter 3
            pattern = r'''(?x)          # set flag to allow verbose regexps
                (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
                | \w+(?:-\w+)*          # words with optional internal hyphens
                | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
                | \.\.\.                # ellipsis
                | [][.,;"'?():-_`]      # these are separate tokens; includes ], [
                '''
        elif tokenizer_type == tokenizer_types[1]:
            # use enhanced regular expression tokenizer based on basic regular expression tokenizer
            pattern = r'''(?x)                  # set flag to allow verbose regexps
                (?:[A-Z]\.)+                    # abbreviations, e.g. U.S.
                | \$?\d+(?:,\d+)?(?:\.\d+)?%?   # currency or percentages or numbers that include a comma and/or a period e.g. $12.50, 52.25%, 30,000, 3.1415, 1,655.8
                | \w+(?:-\w+)*                  # words with optional internal hyphens e.g. state-of-the-art
                | \.\.\.                        # ellipsis ...
                | \'[sS]                        # tokenize "'s" together
                | [][.,;"'?():-_`]              # these are separate tokens; include ], [
                | \w+                           # word characters
                '''
        else:
            raise Exception("Error: tokenizer type \'" + str(tokenizer_type) +
                            "\' does not exist in [" +
                            (', '.join(tokenizer_types)) + '].')
        regex_tokenizer = RegexpTokenizer(pattern)
        title_tokens = regex_tokenizer.tokenize(title)
        body_tokens = regex_tokenizer.tokenize(body)
        print('Tokenization results:')
        print(title_tokens)
        print(body_tokens)
        print('---------------------------------------------')
        # 2. sentence splitting
        # ## use built-in tagged sentence (for clarify)
        body_sents = nltk.sent_tokenize(body)
        print('Sentences splitting results:')
        print(body_sents)
        print('---------------------------------------------')
        # 3. POS tagging
        # NOTE(review): each element is actually a list of (token, tag)
        # tuples, not a list of str — annotation kept for compatibility.
        pos_tags: List[List[str]] = list()
        for body_sent in body_sents:
            body_tokens = regex_tokenizer.tokenize(body_sent)
            body_pos_tags = nltk.pos_tag(body_tokens)
            pos_tags.append(body_pos_tags)
        print('Part-of-speech tagging results:')
        print(pos_tags)
        print('---------------------------------------------')
        # 4. number normalization
        # has implemented in `pattern` during tokenization step
        # measured entity detection
        ud = UnitEntityDetector(pos_tags)
        unit_entity = ud.unit_detection()  # get a list of unit entities
        print('Measured entity detection:')
        print(unit_entity)
        print('---------------------------------------------')
        # 5. date recognition
        dr = DateRecognizer(pos_tags)
        dates = dr.recognize_dates()  # get a list of detected dates
        print('Date recognition:')
        print(dates)
        print('---------------------------------------------')
        # 6. date parsing
        dp = DateParser(body_sents, pos_tags)
        print('Date parsing:')
        dp.date_parse(dates)  # parse the dates detected by date recognizer
    except Exception as ex:
        # BUG FIX: printing ex.args[0] raised IndexError for exceptions
        # constructed without arguments; printing the exception itself is
        # safe and shows the same message for the Exception raised above.
        print(ex)
def retrieve(self, parsedQuery):
    '''
    Answer a parsed query in three stages:
    1. Find the relevant paragraphs.
    2. Find the relevant sentences.
    3. Return the answer from the relevant sentences based on the
       expected answer type.

    parsedQuery -- dict with at least 'query_vector', 'answer_type' and
                   'original_question' keys.
    Returns the answer string, or an apology when nothing matched.
    '''
    # Find the relevant paragraphs
    relevant_paragraphs = self.getRelevantParagraphs(
        parsedQuery['query_vector'])
    # Retrieve all the sentences from those paragraphs.
    all_sentences = []
    for item in relevant_paragraphs:
        if item is not None:  # idiom fix: was `!= None`
            para = self.data[item[0]]
            all_sentences.extend(sent_tokenize(para))
    if len(all_sentences) == 0:
        return "My apologies! I don't know the answer."
    # Unigram similarity
    relevant_sentences = self.getRelevantSentences(all_sentences,
                                                   parsedQuery, 1)
    answer_type = parsedQuery['answer_type']
    # Set the default answer as the first (best-matching) sentence.
    answer = relevant_sentences[0][0]
    stemmer = PorterStemmer()

    def pick_entity_answer(entities, labels, current):
        # Shared selection loop (was copy-pasted five times):
        # scan the tagged entities; each entity whose tag is in *labels*
        # becomes the candidate answer, and scanning stops at the first
        # candidate that shares no stemmed token with the question.
        # NOTE: the candidate is kept even when it overlaps the question
        # and no better one follows — this mirrors the original logic.
        question_tokens = [
            stemmer.stem(word.lower())
            for word in word_tokenize(parsedQuery['original_question'].lower())
        ]
        chosen = current
        for entity in entities:
            if entity[0] in labels:
                chosen = entity[1]
                answer_tokens = [
                    stemmer.stem(word.lower())
                    for word in word_tokenize(chosen.lower())
                ]
                if any(tok in question_tokens for tok in answer_tokens):
                    continue
                break
        return chosen

    if answer_type == 'DEFINITION':
        # NOTE(review): this repeats the getRelevantSentences call above
        # with identical arguments; kept in case the call has side effects.
        retrieved_sentences = self.getRelevantSentences(
            all_sentences, parsedQuery, 1)
        answer = retrieved_sentences[0][0]
    elif answer_type in ('ORGANIZATION', 'PERSON', 'LOCATION'):
        # Company/Organization, Person, Location — same algorithm, the
        # only difference is which NE tag is accepted (LOCATION -> GPE).
        tag = {'ORGANIZATION': ('ORGANIZATION',),
               'PERSON': ('PERSON',),
               'LOCATION': ('GPE',)}[answer_type]
        named_entities = self.getNamedEntities(
            [item[0] for item in relevant_sentences])
        answer = pick_entity_answer(named_entities, tag, answer)
    elif answer_type == 'DATE':
        dp = DateParser()
        dates = []
        for sentence in relevant_sentences:
            dates.extend(dp.extractDate(sentence[0]))
        if len(dates) > 0:
            answer = dates[0]
    elif answer_type in ['NN', 'NNP']:
        # Other types: select from POS chunks instead of named entities.
        named_entities = self.getChunks(
            [item[0] for item in relevant_sentences])
        labels = ('NN', 'NNS') if answer_type == 'NN' else ('NNP', 'NNPS')
        answer = pick_entity_answer(named_entities, labels, answer)
    return answer
def setUp(self):
    # Give every test case a fresh DateParser instance.
    self.date = DateParser()
class DateParserTest(unittest.TestCase):
    """Tests for DateParser: splitting, range checks, digit-width rules
    and full parsing of "a/b/c" date strings."""

    def setUp(self):
        # Fresh DateParser for every test case.
        self.date = DateParser()

    # The input file consists of a single line containing three integers
    # separated by "/". There are no extra spaces around the "/" ...
    def test_stringWith3Integers(self):
        dateString = "12/17/85"
        self.assertEqual(self.date.returnIntegers(dateString), [12, 17, 85])

    def test_badStrings(self):
        # Non-numeric parts, too many fields, or embedded spaces must raise.
        dateStrings = ["sdklafj/12/97", "12/23/23/19", "12 /37/99"]
        for dateString in dateStrings:
            with self.assertRaises(DateParserException):
                self.date.returnIntegers(dateString)

    # ...between 0 and 2999, ...
    def test_dateInRange(self):
        dates = [[12, 34, 2011], [23, 67, 199], [1, 1, 2]]
        for date in dates:
            self.assertTrue(self.date.dateInRange(date))

    def test_dateNotInRange(self):
        dates = [[12, 34, 3111], [-3, 67, 199], [1, 1000000000, 2]]
        for date in dates:
            self.assertFalse(self.date.dateInRange(date))

    # At most one of the integers has four digits, and the others have
    # one or two digits.
    def test_validSizes(self):
        dates = [["3", "32", "8887"], ["31", "3486", "98"], ["98", "1", "2"]]
        for date in dates:
            self.assertTrue(self.date.atMostOneHave4Digits(date))

    def test_invalidSizes(self):
        dates = [["3", "832", "8887"], ["831", "3486", "98"],
                 ["88898", "1", "2"], ["8898", "1", ""]]
        for date in dates:
            self.assertFalse(self.date.atMostOneHave4Digits(date))

    def test_atMostOneHave4Digits(self):
        dates = [["23", "32", "87"], ["31", "3486", "98"]]
        for date in dates:
            self.assertTrue(self.date.atMostOneHave4Digits(date))

    def test_moreThanOneHave4Digits(self):
        dates = [["23", "2232", "2287"], ["1131", "3486", "98"],
                 ["1131", "3486", "8998"]]
        for date in dates:
            self.assertFalse(self.date.atMostOneHave4Digits(date))

    def test_getEarlyDate(self):
        # BUG FIX: was itertools.izip (Python 2 only); the builtin zip
        # behaves identically here and is portable to Python 3.
        dates = ["02/4/67", "31/9/73", "2014/2/29"]
        results = ["2067-02-04", "31/9/73 is illegal", "2014/2/29 is illegal"]
        for date, result in zip(dates, results):
            self.assertEqual(self.date.parseDate(date), result)
class Evaluator(object):
    """Summarizes sessions from a log file into a CSV report."""

    # Time in minutes after which an edit on the item page is not counted
    # anymore when calculating how many changes occurred after a view on
    # an edit page.
    intervall = 10

    def __init__(self, log):
        """
        log -- path to the log file the evaluation is based on.
        """
        super(Evaluator, self).__init__()
        self.log_file_path = log
        self.date_parser = DateParser()

    # writes to a csv file every change that occurred during a session;
    # the script takes the info it needs from the log file passed in its
    # constructor. Might also write another file (or the same file) which
    # states whether a change occurred on the item's page during an
    # interval (see self.intervall).

    def _summarize_summary(self, result):
        """Aggregate per-constraint counts into a single
        violation/compliance/exception total."""
        summary = {'violation': 0, 'compliance': 0, 'exception': 0}
        for res in result:
            summary['violation'] += result[res]['violation']
            summary['compliance'] += result[res]['compliance']
            summary['exception'] += result[res]['exception']
        return summary

    def _find_index_of_latest_visit_of_session(self, lines, special_page,
                                               entity_id, start_time,
                                               start_index):
        # TODO: return latest index...
        return 0

    def _delete_unneeded_entries_for_session(self, lines, special_page,
                                             entity_id, i, end_index):
        # TODO: delete...
        return lines

    def run(self):
        """Write one CSV row per logged session.

        For every JSON log entry a row of (special_page, entity_id, start,
        violations, compliances, exceptions) is written to a timestamped
        file under csv/.
        """
        csv_file = open(
            "csv/evaluation" +
            datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".csv",
            "wb")
        csv_writer = csv.writer(csv_file)
        lines = [line.strip() for line in open(self.log_file_path)]
        i = 0
        while i < len(lines):
            if lines[i].find("unittest") != -1:
                # BUG FIX: the original `continue` never advanced `i`,
                # looping forever on the first unittest entry.
                i += 1
                continue
            log_entry = json.loads(lines[i][lines[i].find("{"):])
            special_page = log_entry["special_page_id"]
            entity_id = log_entry["entity_id"]
            start = self.date_parser.get_date_from_timestamp(
                log_entry["insertion_timestamp"])
            # BUG FIX: these helpers are defined with a leading
            # underscore; the original calls without it raised
            # AttributeError on the first processed entry.
            end_index = self._find_index_of_latest_visit_of_session(
                lines, special_page, entity_id, start, i)
            lines = self._delete_unneeded_entries_for_session(
                lines, special_page, entity_id, i, end_index)
            result_summary = self._summarize_summary(
                log_entry["result_summary"])
            csv_writer.writerow(
                (special_page, entity_id, start,
                 result_summary['violation'], result_summary['compliance'],
                 result_summary['exception']))
            # NOTE(review): the loop relies on
            # _delete_unneeded_entries_for_session shrinking `lines`;
            # until that TODO is implemented, the same entry is processed
            # repeatedly.
            # repeat until list is empty
            # take first entry, memorize result, SPid and entityId and
            # search last visit entry that belongs to session
            # delete every entry with this SPid and entityId until last,
            # search all belonging job entries, memorize results, delete
            # them and every job entry before
            # build Session object, write this entry to csv file
        csv_file.close()
def __init__(self, log):
    """Remember the log file path and create the timestamp parser.

    log -- path to the log file the evaluation reads from.
    """
    super(Evaluator, self).__init__()
    # Independent attribute set-up; DateParser handles timestamp parsing.
    self.date_parser = DateParser()
    self.log_file_path = log
def test_parse_complete_date_separated_by_spaces(self):
    """A full space-separated "d m yyyy" date is normalized to ISO form."""
    parsed = DateParser().reformat("3 9 2020")
    self.assertEqual(parsed, "2020-09-03")
def test_parse_incomplete_date_separated_by_spaces(self):
    """A "d m" date with no year picks up the parser's default_year."""
    parser = DateParser(default_year="2020")
    self.assertEqual(parser.reformat("3 9"), "2020-09-03")
) #Builds the objects and returns the list of RDO objects SexObject.append(2) DataList.append(SexObject) Counter += 1 elif DataType == 3: print('Picked 3') DiseaseObject = DiseaseParser(rows, ID_Table).getRDO( ) #Builds the objects and returns the list of RDO objects DiseaseObject.append(3) DataList.append(DiseaseObject) Counter += 1 elif DataType == 4: print('Picked 4') DateObject = DateParser(rows, ID_Table).getRDO( ) #Builds the objects and returns the list of RDO objects DateObject.append(4) DataList.append(DateObject) Counter += 1 elif DataType == 5: print('Picked 5') RateObject = incidenceMortalityPrevalenceRate(rows, ID_Table).getRDO( ) # Builds the objects and returns the list of RDO objects RateObject.append(6) # yeah i know picked 5... but id 5 is for ID DataList.append(RateObject) Counter += 1 else: print('I see you want the hidden option... too bad it does not exist!')