# Wiki signature: a [[User:name|alias]] or [[User talk:name|alias]] link,
# optionally followed by more [[...]] links (possibly parenthesised) and a
# free-form date string made of letters, digits, "/", ",", ":", "-" and
# whitespace.
_USER_DATE_PATTERN = (
    r'\[\[User(\s*talk)*:(?P<name>.*?)'
    r'\|(?P<alias>.*?)\]\]'
    r'[\s\|]*(\(*\[\[.*\]\]\)*)*\s*'
    r'(?P<date>[a-zA-Z0-9i/,:\-\s]+)?'
)

# Leading run of wiki indent/list markup (":", "*", "#", ";") or digits.
# NOTE(review): digits count as indent markup here, so a turn whose text
# starts with a number is treated as an indented reply -- confirm intended.
_INDENT_PATTERN = r'[:\*#;\d]+'


def _collect_turns(topic, turn_section):
    """Split *turn_section* into turns and add each one to *topic*."""
    signature_re = re.compile(_USER_DATE_PATTERN)

    # Non-empty, stripped lines seen since the last completed turn.
    pending = []
    for raw_line in turn_section.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        pending.append(line)
        if signature_re.search(line) is None:
            # Unsigned line: authors may break a message over several
            # lines, so keep accumulating until a signature shows up.
            # NOTE(review): the original also tried to start a new turn on
            # an unsigned ':'-prefixed line that is not list markup, but
            # ':' is itself part of the list pattern, so that branch could
            # never run. It is dropped here; behaviour is unchanged.
            continue

        # A signature closes the turn: everything pending belongs to it.
        turn_text = '\n'.join(pending)
        pending = []

        # If there are multiple signatures in the turn, the last one wins.
        author = None
        date_str = None
        for match in signature_re.finditer(turn_text):
            author = match.group('name')
            date_str = match.group('date')

        date = None
        if date_str is not None:
            date = str_to_datetime(date_str)

        if date is not None:
            topic.add_turn(Turn(turn_text, author, date))
        else:
            topic.add_turn(Turn(turn_text, author))

    # Trailing text without a signature still forms an (anonymous) turn.
    if pending:
        topic.add_turn(Turn('\n'.join(pending)))


def _link_replies(turns):
    """Connect each indented turn to the turn it replies to."""
    indent_re = re.compile(_INDENT_PATTERN)

    # Chain of (indent level, turn) from the thread root to the most recent
    # turn. Levels strictly increase along the chain; the root is level 0.
    chain = []
    for turn in turns:
        indent = indent_re.match(turn.text)
        if indent is None or not chain:
            # An unindented turn starts a new thread. The very first turn
            # is used as the thread root even if it happens to be indented.
            del chain[:]
            chain.append((0, turn))
            continue

        level = len(indent.group())
        # Drop every turn that is indented at least as deep as this one:
        # the reply belongs to the nearest shallower turn. The root has
        # level 0 and level >= 1 here, so the chain never becomes empty.
        while chain[-1][0] >= level:
            chain.pop()

        parent = chain[-1][1]
        parent.add_sub(turn)
        turn.add_parent(parent)
        chain.append((level, turn))


def find_turns(topic, turn_section):
    """Parse the turns of a talk-page topic and thread them.

    The section text is split on line breaks. Blank lines are skipped.
    Consecutive lines are merged into one turn until a line containing a
    user signature is found; the turn is then created with the author (and
    the date, when ``str_to_datetime`` returns one) taken from the last
    signature in the merged text. Text left over at the end without a
    signature becomes a turn without author.

    After that, every turn in ``topic.turns`` is linked to its parent
    according to the length of its leading indent markup: a turn replies to
    the closest preceding turn with a smaller indent, and a turn without
    indent markup starts a new thread. Unlike the previous implementation,
    two consecutive replies with the same indent become siblings even when
    the indent jumps by more than one step.

    Args:
        topic: object providing ``add_turn(turn)`` and a ``turns`` sequence.
        turn_section: wiki text of the topic body.

    Returns:
        None. ``topic`` and its turns are updated in place.
    """
    _collect_turns(topic, turn_section)
    # Turns in a topic are stored in the order they were added.
    _link_replies(topic.turns)
# Test the regular expression to extract user and date from
# the talk page
import re
import datetime
import smart_date

# Read the whole archive up front; the context manager closes the file
# even if reading fails.
with open('talk_archive') as f:
    content = f.read()

# Signature pattern: a [[User:name|alias]] / [[User talk:name|alias]] link,
# optionally followed by more [[...]] links, then a mandatory date-like run
# of letters, digits, "/", ",", ":", "-" and whitespace.
user_date_reg = (r'\[\[User(\s*talk)*:(?P<name>.*?)'
                 r'\|(?P<alias>.*?)\]\]'
                 r'[\s\|]*(\[\[.*\]\])*\s*'
                 r'(?P<date>[a-zA-Z0-9i/,:\-\s]+)')
user_date_reg_c = re.compile(user_date_reg)

result = user_date_reg_c.finditer(content)
for match in result:
    user = match.group('name')
    date_str = match.group('date')
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('User: ' + user + ' Date: ' + date_str)
    date = smart_date.str_to_datetime(date_str)
    if date is None:
        print('Datetime is not in recognized format!')
    else:
        print(date)