def parse_plays(self, filename):
    """Parse a play script file and hand its spoken lines to the line parser.

    Handles the plays found on
    http://sydney.edu.au/engineering/it/~matty/Shakespeare/

    Everything up to and including the first line that contains "ACT I" is
    skipped. After that point:

    * a blank (empty or whitespace-only) line sets a flag;
    * a non-blank line that follows a blank line and is accepted by
      ``self._is_next_character`` is split at its first tab: the part
      before the tab becomes the current character and the part after it
      is passed to ``self._parse_play_line`` with an empty list;
    * any other non-blank line accepted by ``self._is_text`` is stripped
      and passed to ``self._parse_play_line`` together with the current
      character and the value returned by the previous call.

    Args:
        filename: Path of the play text file to read.
    """
    # Count the lines first so the progress bar knows its total length.
    # The counting handle is closed before the parsing pass begins.
    with open(filename, "r") as f:
        line_count = sum(1 for _ in f)
    bar = ProgressBar(length=line_count, name="Parsing " + filename)

    temp_char = ""      # character currently speaking ("" until one is seen)
    remainder = []      # value returned by the last _parse_play_line call
    empty_line = False  # True when the previous line was blank
    start = False       # becomes True once the "ACT I" marker has been seen

    with open(filename, "r") as f:
        for line in f:
            bar.update()

            if not start:
                # Still in the preamble: look for the marker, parse nothing.
                if "ACT I" in line:
                    start = True
                continue

            if not line or line.isspace():
                empty_line = True
                continue

            if empty_line and self._is_next_character(line):
                # NOTE(review): assumes the line contains a tab; when it
                # does not, find() returns -1 and the slices below degrade
                # to line[:-1] / the whole line -- confirm that
                # _is_next_character guarantees a tab.
                tab = line.find('\t')
                temp_char = line[:tab]
                remainder = self._parse_play_line(line[tab + 1:], temp_char, [])
            elif temp_char and self._is_text(line):
                remainder = self._parse_play_line(line.strip(), temp_char, remainder)
            empty_line = False
    bar.done()
def source(self, url, **kwargs):
    """Fetch the document at ``url`` and feed its text to ``_parse_line``.

    The response is parsed with BeautifulSoup, the document text is
    extracted with ``get_text()`` and split into lines, and every line is
    passed to ``self._parse_line`` while a progress bar is updated.

    Args:
        url: Address of the document to download and parse.
        **kwargs: Accepted but not used by this method.
    """
    # Close the HTTP response as soon as BeautifulSoup has consumed it,
    # rather than leaving the connection open until garbage collection.
    # NOTE(review): assumes ``request`` is ``urllib.request`` -- confirm.
    with request.urlopen(url) as response:
        soup = BeautifulSoup(response)

    lines = soup.get_text().splitlines()
    bar = ProgressBar(length=len(lines), name="Parsing "+url)
    for line in lines:
        bar.update()
        self._parse_line(line)
    bar.done()
def parse(self, inputfile, source, **kwargs):
    """Load ``inputfile`` through the parser and store its sentences.

    ``self.parser.source`` is called with ``inputfile`` and ``kwargs``.
    Every sentence then yielded by ``self.parser.get_next()`` has the
    backend's ``SENTENCE_START`` / ``SENTENCE_END`` markers added in place
    to its ``text`` list and is stored with ``self.backend.put``.

    Args:
        inputfile: Input passed on to ``self.parser.source``; also used in
            the progress bar title.
        source: Passed unchanged as the second argument of ``backend.put``.
        **kwargs: Forwarded to ``self.parser.source``.

    Returns:
        ``False`` when ``self.parser.source`` raises ``TypeError``;
        otherwise ``None``.
    """
    parser = self.parser
    try:
        parser.source(inputfile, **kwargs)
    except TypeError:
        return False

    title = "Processing " + inputfile
    progress = ProgressBar(name=title, length=len(parser))
    store = self.backend

    for entry in parser.get_next():
        tokens = entry.text
        # The markers are added to the sentence's own list, in place.
        tokens.insert(0, store.SENTENCE_START)
        tokens.append(store.SENTENCE_END)
        store.put(tokens, source, entry.char)
        progress.update()

    progress.done()