# Example no. 1
def parse_date_text(text):
	"""Parse a date found at the beginning of ``text``.

	The text is lower-cased, then every candidate produced by
	``_possible_texts`` is run through a chart parser built from
	``date_grammar_string``. The first candidate that yields at least one
	parse is used, and its parse tree is converted into a TimelineDate by
	the nested helpers below.

	Assumes that the date is at the beginning of the string with no
	superfluous characters.

	Args:
		text: the string that is expected to start with a date.

	Returns:
		A tuple ``(TimelineDate, index)`` where ``text[:index]`` is the text
		determined to be the date, or None if no candidate could be parsed.

	Raises:
		RuntimeError: if a date range has a first operand of a node type
			that the range conversion does not know how to qualify.
	"""
	text = text.lower()

	date_parser = BottomUpChartParser(parse_cfg(date_grammar_string))

	parses = []

	# stop at the first candidate text that the grammar accepts; date_text
	# keeps that candidate so its length can be returned as the index
	for date_text in _possible_texts(text):
		parses = date_parser.nbest_parse(date_text)
		if parses:
			break

	if not parses:
		return None

	# these are all very closely tied to date_grammar
	def numstr(tree): # returns string of digits
		return ''.join(l for l in tree.leaves() if l.isdigit())

	def month(tree): # returns TimePoint
		return TimePoint(month = month_to_num(tree[0].node))

	def monthday(tree): # returns TimePoint
		d = None
		for n in tree:
			# leaves have no 'node' attribute, only subtrees do
			if hasattr(n, 'node'):
				if n.node == 'DAY':
					d = int(numstr(n))
				elif n.node == 'MONTH':
					m = month(n)
				elif n.node == 'MONTHNUM': # only one of MONTH and MONTHNUM should be present
					m = TimePoint(month = int(numstr(n)))
		m.day = d
		return m

	def num(tree): # returns TimePoint
		if tree[0].node == 'NUME':
			return TimePoint(int(numstr(tree[0])))
		elif tree[0].node == 'NUMQ':
			# qualified number: take the first NUME child and flag the year as approximate
			return TimePoint(
				[int(numstr(n)) for n in tree[0] if n.node == 'NUME'][0],
				year_approx = True)

	def dece(tree): # returns number
		# a single child is a whole number; otherwise children 0 and 2 are
		# the integer and fractional digits
		if len(tree) == 1:
			return int(numstr(tree[0]))
		else:
			return float(numstr(tree[0]) + '.' + numstr(tree[2]))

	def dec(tree): # returns TimePoint
		if tree[0].node == 'DECE':
			return TimePoint(dece(tree[0]))
		elif tree[0].node == 'DECQ':
			return TimePoint([dece(n) for n in tree[0] if n.node == 'DECE'][0], year_approx = True)
		elif tree[0].node == 'DECQQ':
			return TimePoint(dece(tree[0][0]), year_approx = dece(tree[0][4]))

	def period(tree): # returns TimelineDate
		isad = tree.node == 'PERIODAD'

		n = num(tree[0][0][0])
		if tree[0][2].node == 'century': factor = 100
		elif tree[0][2].node == 'millenium': factor = 1000

		if isad:
			return TimelineDate(n * factor - factor, n * factor)
		else:
			# BC periods are mirrored and shifted by one
			return TimelineDate(-n * factor + 1, -n * factor + factor + 1)

	def yadyymymd(tree): # returns TimePoint
		# name stands for year AD: year, year month, year month day
		monthtp = None
		daynum = None
		for s in tree.subtrees():
			if s.node == 'NUM':
				yeartp = num(s)
			elif s.node == 'YADPRECISEYEAR':
				# there should never be a NUM and a YADPRECISEYEAR
				yeartp = TimePoint(int(numstr(s)))
			elif s.node == 'MONTH':
				monthtp = month(s)
			elif s.node == 'DAY':
				daynum = int(numstr(s))
		if monthtp: yeartp.month = monthtp.month
		if daynum is not None: yeartp.day = daynum
		return yeartp

	def year(tree): # returns TimePoint
		if tree.node == 'YBC':
			# because we are using astronomical years in which x BC is stored as (-x + 1)
			return -num(tree[0]) + 1
		elif tree.node == 'YAD':
			return yadyymymd(tree)

	def _has_child_node(n, label):
		# indices of the direct children of n that are subtrees labelled `label`
		return [i for i, c in enumerate(n) if hasattr(c, 'node') and c.node == label]

	def daterange(r): # returns TimelineDate
		if r[0][0].node == 'YAD' and r[2][0].node == 'YAD':
			# for cases like 1832-34. gets parsed as YAD to YAD, but we need
			# to modify the second node
			first = date(r[0])
			second = date(r[2])
			first_str = str(first.start.year)
			second_str = str(second.start.year)
			if len(second_str) < len(first_str):
				# borrow the missing leading digits from the first year
				second.start.year = int(
					first_str[:len(first_str) - len(second_str)] + second_str)
		elif r[0].node == 'DATE' and r[0][0].node != 'YAD' and r[0][0].node != 'PERIODAD':
			# these cases should work without any modification to either node
			first = date(r[0])
			second = date(r[2])
		else:
			# for cases like 34-12 b.c. or 12th century to 10th century bc.
			# Gets parsed as YAD to YBC or NUM to YBC. replacement_node finds
			# the '34' in the AST for YAD, and puts it in the corresponding
			# location for a copy of the YBC ast. This gets us a fully
			# qualified date for '34' that we can use to create the range
			if r[0][0].node == 'YAD':
				replacement_node = next(r[0][0].subtrees(filter = lambda x: x.node == 'NUM'))
			elif r[0][0].node == 'PERIODAD':
				replacement_node = r[0][0][0][0]
			elif r[0].node == 'ORD':
				replacement_node = r[0]
			else:
				raise RuntimeError('unexpected node type in date AST')
			# copy the second date's AST, replace, and get the date
			first_mock = r[2].copy(True) #deepcopy
			parent = next(first_mock.subtrees(lambda t: _has_child_node(t, replacement_node.node)))
			parent[_has_child_node(parent, replacement_node.node)[0]] = replacement_node
			first = date(first_mock)

			second = date(r[2])

		return TimelineDate.span_from_dates(first, second)

	def date(tree): # returns TimelineDate
		if tree[0].node == 'YAD' or tree[0].node == 'YBC':
			return TimelineDate(year(tree[0]))
		elif tree[0].node == 'PERIODAD' or tree[0].node == 'PERIODBC':
			return period(tree[0])

	def yearsago(tree): # returns TimelineDate
		# assume years ago means years ago from Jan 1 1950
		# node labels ending in S take a single value, those ending in R take
		# two values (children 0 and 2); K* values are scaled by 1000 and M*
		# values by 1000000
		if tree[0].node == 'YAS':
			return TimelineDate(-num(tree[0][0]) + 1950)
		elif tree[0].node == 'YAR':
			return TimelineDate(-num(tree[0][0]) + 1950, -num(tree[0][2]) + 1950)
		elif tree[0].node == 'KAS':
			return TimelineDate(-dec(tree[0][0]) * 1000 + 1950)
		elif tree[0].node == 'KAR':
			return TimelineDate(-dec(tree[0][0]) * 1000 + 1950, -dec(tree[0][2]) * 1000 + 1950)
		elif tree[0].node == 'MAS':
			return TimelineDate(-dec(tree[0][0]) * 1000000 + 1950)
		elif tree[0].node == 'MAR':
			return TimelineDate(-dec(tree[0][0]) * 1000000 + 1950, -dec(tree[0][2]) * 1000000 + 1950)

	def monthdayrange(r): # returns TimelineDate
		copy_from_first = False
		second = None
		if r[2].node == 'DAY':
			# the end of the range is only a day; its year and month are
			# filled in from the start of the range further down
			second = TimePoint(day = int(numstr(r[2])))
			copy_from_first = True
		elif r[2].node == 'MONTHDAY':
			second = monthday(r[2])
		elif r[2].node == 'YADYEARMONTH' or r[2].node == 'YADYEARMONTHDAY':
			second = yadyymymd(r[2])

		# the start of the range takes whatever it does not specify itself
		# from the end of the range
		if r[0].node == 'DAY':
			first = TimePoint(second.year, second.month, int(numstr(r[0])), year_approx = second.year_approx)
		elif r[0].node == 'MONTH':
			first = TimePoint(second.year, month(r[0]).month, year_approx = second.year_approx)
		elif r[0].node == 'MONTHDAY':
			temp = monthday(r[0])
			first = TimePoint(second.year, temp.month, temp.day, year_approx = second.year_approx)

		if copy_from_first:
			second.year = first.year
			second.month = first.month

		return TimelineDate(first, second)

	def monthdayyearrange(r): # returns TimelineDate
		# r[0] is the starting month and day, r[2] the ending day and r[4]
		# the year shared by both ends
		yeartp = TimePoint(int(numstr(r[4])))
		monthdaytp = monthday(r[0])
		return TimelineDate(
			TimePoint(yeartp.year, monthdaytp.month, monthdaytp.day, year_approx = yeartp.year_approx),
			TimePoint(yeartp.year, monthdaytp.month, int(numstr(r[2])), year_approx = yeartp.year_approx))

	def timename(tree): # returns TimelineDate
		if tree[0].node == 'antiquity':
			return TimelineDate(TimePoint(-750), TimePoint(450))

	parse = None
	if len(parses) > 1:
		# ambiguous parses are resolved as follows
		# 1. DATE/MONTHDAY ambiguity
		# for a string like December 3 or 3 December:
		#	MONTHDAY (prefer)
		# 2. the same ambiguity inside a range
		# for a string like 6 June - 3 October 2013:
		#	MONTHDAYRANGE (prefer)
		# (this is almost the same thing as problem 1)
		# The other category is something like 3 December 4, where
		#	YADYEARMONTHDAY -> MONTHDAY ocommadotsp YADYEAR
		# would be the preferred reading. This should be extremely rare, and
		# we do not bother dealing with it (it can also happen in
		# DATERANGE); such cases fall through to the warning below.

		temp = [p for p in parses if p[0].node == 'MONTHDAY']
		if len(temp) == 1:
			parse = temp[0]
		if not parse:
			temp = [p for p in parses if p[0].node == 'MONTHDAYRANGE']
			if len(temp) == 1:
				parse = temp[0]
		if not parse:
			warnings.warn('not sure how to decide between multiple parses %s' % date_text)
			parse = parses[0]
	else:
		parse = parses[0]

	# NOTE: a top-level node label that is not listed here leaves `result`
	# unbound, so the return below fails with an UnboundLocalError
	if parse[0].node == 'DATE':
		result = date(parse[0])
	elif parse[0].node == 'YEARSAGO':
		result = yearsago(parse[0])
	elif parse[0].node == 'DATERANGE':
		result = daterange(parse[0])
	elif parse[0].node == 'MONTH':
		result = TimelineDate(month(parse[0]))
	elif parse[0].node == 'MONTHDAY':
		result = TimelineDate(monthday(parse[0]))
	elif parse[0].node == 'MONTHDAYRANGE':
		result = monthdayrange(parse[0])
	elif parse[0].node == 'MONTHDAYYEARRANGE':
		result = monthdayyearrange(parse[0])
	elif parse[0].node == 'TIMENAME':
		result = timename(parse[0])

	return (result, len(date_text))