def parse_section_of_a_file(inobj, outobj, datadir, start=None, end=None,
                            convert_from_trees=False,
                            keep_temporary_section=False, **parse_args):
    """Parse just a portion of a file and send the parser output to outobj.

    This function is the workhorse of ParseHog.parse.

    inobj and outobj are either file-like objects or filenames.

    start and end are 0-based line numbers and are both inclusive. Either
    can be None to mean the beginning or end of the file, respectively.

    If convert_from_trees is true, the input is wrapped in a TreeConverter
    and the parser is told the text is already tokenized.

    keep_temporary_section is passed to keepable_tempfile() as its `keep`
    argument for the temporary file holding the selected lines.

    datadir and any remaining keyword arguments are forwarded to
    ECParser.parse_sgml_file(). Returns None; the parser output goes to
    outobj."""
    input_file = open_file_or_filename(inobj)
    if convert_from_trees:
        input_file = TreeConverter(input_file)
        parse_args['already_tokenized'] = True  # set -K

    # copy the requested lines into a temporary file for the parser.
    # the bounds are tested explicitly against None instead of relying on
    # how Python 2 happens to order None, integers and strings.
    sliced_file = keepable_tempfile(keep=keep_temporary_section)
    for current_line_number, line in enumerate(input_file):
        if end is not None and current_line_number > end:
            # past the last requested line, no need to read any further
            break
        if start is None or current_line_number >= start:
            sliced_file.write(line)
    # make sure the parser (a separate process) sees everything we wrote
    sliced_file.flush()

    parser = ECParser()
    parser.parse_sgml_file(sliced_file, output=outobj, datadir=datadir,
                           **parse_args)
def parse_sgml_file(self, sgml_fileobj_or_name, output=None, debug=False,
                    skip_blank_lines=False, logstream=sys.stdout,
                    error_messages_are_failures=True, **parse_args):
    """Run the parser on an <s> file and collect its output.

    sgml_fileobj_or_name is either a filename or a file object with a
    `name` attribute (i.e. tempfile.TemporaryFile is not okay but a
    tempfile.NamedTemporaryFile is fine), since the parser is handed the
    file by name.

    output, if given, is a file object or filename that the parser's
    stdout is written to. With skip_blank_lines, blank lines are dropped
    from the redirected output.

    debug turns on logging of the command and of the parser's stdout.
    All log messages go to logstream.

    See get_parser_command() for a description of parse_args.

    Returns the parser's stdout as a string, or the placeholder
    "(redirected to ...)" when output was given. The same value is kept in
    self.last_output; stderr is kept in self.last_error and the exit
    status in self.last_status.

    Raises ParserPrintedErrorMessages if the parser wrote anything to
    stderr and error_messages_are_failures is true, and BadParserExitCode
    if the parser exited with a nonzero status."""
    # TODO don't we already have something like this?
    if isinstance(sgml_fileobj_or_name, basestring):
        name = sgml_fileobj_or_name
    else:
        name = sgml_fileobj_or_name.name

    cmd = get_parser_command(name, **parse_args)
    if debug:
        print >>logstream, "ECParser: Running %r" % cmd
        if output:
            print >>logstream, "ECParser: Redirecting output to %r" % output
        logstream.flush()

    # fork and run the parser as a child. Popen3 objects will give us
    # their stdin/stdout/stderr file handles and PID.
    # NOTE(review): stdout is read to EOF before stderr is touched; a
    # parser that writes a lot to stderr could presumably stall here.
    self.parser = Popen3(cmd, capturestderr=True)
    self.pid = self.parser.pid

    if output:
        # redirect output
        output_file = open_file_or_filename(output, 'w')
        data = self.parser.fromchild.read()
        if skip_blank_lines:
            # keep the line endings so that the surviving lines are
            # written out exactly as the parser produced them
            for line in data.splitlines(True):
                if line.strip():
                    output_file.write(line)
        else:
            output_file.write(data)
        output_file.flush()
        self.last_output = "(redirected to %r)" % output
        # NOTE: a check that the output had the expected number of lines
        # (raising MissingScores) was disabled here because it did not
        # handle nbest or LM output.
    else:
        self.last_output = self.parser.fromchild.read()

    self.last_error = self.parser.childerr.read().strip()
    if self.last_error:
        print >>logstream, "Parser stderr -----"
        print >>logstream, self.last_error
        print >>logstream, "End parser stderr -----"
        logstream.flush()
        if error_messages_are_failures:
            # reap the child before bailing out so that it is not left
            # behind unwaited and self.pid does not go stale
            self.last_status = self.parser.wait()
            self.pid = None
            raise ParserPrintedErrorMessages(self.last_error)

    self.last_status = self.parser.wait()
    self.pid = None

    if self.last_status != 0 or debug:
        print >>logstream, "Parser stdout -----"
        # goes to logstream like its header and footer
        print >>logstream, self.last_output
        print >>logstream, "End parser stdout -----"
        logstream.flush()
    if self.last_status != 0:
        raise BadParserExitCode(self.last_status)

    return self.last_output