def string_blocks_to_events(string_blocks, p=None):
    """Convert string blocks into a flat list of timeline events.

    Args:
        string_blocks: Iterable of string blocks (as produced by
            html_to_string_blocks; all strings are expected to be
            non-empty). Each block is read as a mapping with:
              - 'heading': sequence of heading strings. The first entry is
                used as the section name; every entry is passed through
                parse_date_html to build a base date for the block.
              - 'lines': sequence of mappings with 'line_type' (compared
                against LineTypes.line / LineTypes.table) and 'line'.
        p: Optional parameter mapping; it is normalized by param_defaults.
            Keys read here:
              - 'single_section': if truthy, only the section whose name
                matches it (case-insensitive, stripped) is processed.
              - 'extra_ignore_sections': '&'-separated section names added
                to the ignore set.
              - 'continuations': if truthy, undated lines are appended to
                the current event; otherwise each undated line becomes its
                own event that reuses the current event's date fields.
            The normalized mapping is also forwarded to _table_to_events.

    Returns:
        A list of timeline events. Events built in this function have the
        shape {date: number, date_length: ..., date_string: string,
        content: string}; events for table lines come from
        _table_to_events.
    """
    curr_ignore_sections = _ignore_sections.copy()
    p = param_defaults(p or {})

    def section_test(name):
        """Return True if the section called *name* should be processed."""
        normalized = name.strip().lower()
        if p['single_section']:
            return normalized == p['single_section'].strip().lower()
        return normalized not in curr_ignore_sections

    if not any(section_test(sb['heading'][0]) for sb in string_blocks):
        # allow the first section to be processed if it is the only section,
        # excluding excluded sections like see also, etc. Usually this section
        # is just an intro paragraph, but if this if statement is true, it is
        # probably the entire content of the article
        curr_ignore_sections.discard('')

    # NOTE: extra ignore sections are added only after the check above, so
    # they do not influence whether the unnamed first section is re-enabled.
    if p['extra_ignore_sections']:
        for s in p['extra_ignore_sections'].split('&'):
            curr_ignore_sections.add(s.lower().strip())

    curr_event = None
    events = []

    for string_block in string_blocks:
        # the year carried over from the previous dated line never crosses
        # a block boundary
        prev_date = None

        if section_test(string_block['heading'][0]):
            # create base date based on headings:
            # possible perf improvement by caching results for headings
            # across string_blocks
            base_date = TimelineDate(TimePoint())
            base_date_string = ''
            for h in string_block['heading']:
                parse = parse_date_html(h)
                if parse:
                    base_date = TimelineDate.combine(base_date, parse[0])
                    base_date_string = parse[1]

            # if there's a year specified in the headings, we create a fuzzy
            # range that child elements of those headings need to fall in
            base_date_range = None
            if base_date.start_year() is not None:
                delta_minus = 10
                delta_plus = 20
                # a heading year ending in zeros widens the window: the
                # number of trailing zeros sets the order of magnitude
                # (e.g. two zeros -> 100 years back, 200 years forward)
                m = re.search(r'0+$', str(base_date.start.year))
                if m:
                    delta_minus = int('1' + ('0' * (m.end() - m.start())))
                    delta_plus = delta_minus * 2
                base_date_range = (
                    base_date.start_year() - delta_minus,
                    base_date.start_year() + delta_plus,
                )

            for line in string_block['lines']:
                if line['line_type'] == LineTypes.line:
                    parse = parse_date_html(line['line'])
                    # if we can parse a date, create a new event
                    if parse and (
                            (not base_date_range) or
                            (parse[0].start_year() is None) or
                            (base_date_string.lower().strip() == 'antiquity') or
                            (base_date_range[0] <= parse[0].start_year()
                             <= base_date_range[1]) or
                            TimelineDate.can_combine_as_day(base_date, parse[0])):
                        _close_event(events, curr_event)
                        date = parse[0]
                        if date.start_year() is None and prev_date:
                            # this is the case where we have a month or
                            # monthday but no year. in this case, take it
                            # from the previous event
                            date = TimelineDate.combine(prev_date, date)
                        date = TimelineDate.combine(base_date, date)
                        curr_event = {
                            'date': date.start_year(),
                            'date_length': date.length(),
                            'date_string': parse[1],
                            'content': parse[2],
                        }
                        prev_date = date
                    # if we can't parse a date, append the line to the
                    # current event if there is one
                    elif curr_event:
                        if p['continuations']:
                            curr_event['content'] += _line_break + line['line']
                        else:
                            # start a separate event for this line that
                            # shares the date fields of the one just closed
                            _close_event(events, curr_event)
                            curr_event = {
                                'date': curr_event['date'],
                                'date_length': curr_event['date_length'],
                                'date_string': curr_event['date_string'],
                                'content': line['line'],
                            }
                    # if there's no parse and no current event, see if we
                    # can use the base_date
                    elif base_date.start_year() is not None:
                        # no need to close events because curr_event is None
                        curr_event = {
                            'date': base_date.start_year(),
                            'date_length': base_date.length(),
                            'date_string': base_date_string,
                            'content': line['line'],
                        }
                elif line['line_type'] == LineTypes.table:
                    _close_event(events, curr_event)
                    events += _table_to_events(line['line'], base_date, p)
                    curr_event = None

            # flush whatever event is still open at the end of the block so
            # it cannot absorb lines from the next one
            _close_event(events, curr_event)
            curr_event = None

    return events