def parse_transcript(self, filename): meeting = PlenumMeeting() meeting.save() transcript = Transcript() transcript.meeting = meeting transcript.save() matcher = Matcher() text_lines = [] ordinal = count() header = None speaker = None state = "cover" for line in codecs.open(filename, "r", "utf-8"): if state == "cover": if line.strip() == u"תוכן עניינים": cover = "".join(text_lines) block = TranscriptBlock(transcript = transcript, ordinal = ordinal.next(), body = cover) block.save() text_lines = [] state = "toc" elif re.search(ur"^<(?!הצע|החלט)", line): text_lines = [] state = "speaker" elif re.search(r"^ *<", line): text_lines = [] state = "header" elif matcher.search(ur"(הישיבה.*?)\s+$", line): meeting.title = matcher.group(1) print meeting.title
elif matcher.search(ur"(הישיבה.*?)\s+$", line): meeting.title = matcher.group(1) print meeting.title elif state == "toc": if re.search(ur"^<(?!הצע|החלט)", line): text_lines = [] state = "speaker" elif re.search(r"^ *<", line): text_lines = [] state = "header" elif state == "header": if re.search(ur"^<(?!הצע|החלט)", line): header = "".join(text_lines) block = TranscriptBlock(transcript = transcript, ordinal = ordinal.next(), header = header) block.save() header = None text_lines = [] state = "speaker" elif state == "speaker": speaker_line = "".join(text_lines).replace("\n", "") if matcher.search(r'^<(.*?):?>$', speaker_line): header = matcher.group(1) try: speaker = Person.objects.get(name__iexact=header) except Person.DoesNotExist: speaker = Person(name=header) speaker.save() text_lines = []