def parse_string(self, text):
    """Wrap a text string in a new TarsqiDocument.

    The string is stored unchanged in the text variable of a new SourceDoc,
    which is created with None where a filename would otherwise go. That
    SourceDoc is then handed to TarsqiDocument together with an empty
    dictionary.

    Returns the new TarsqiDocument."""
    # TODO: do we need to ensure the text is unicode?
    source = SourceDoc(None)
    source.text = text
    document = TarsqiDocument(source, {})
    return document
def parse_file(self, filename, tarsqidoc):
    """Parse the XML file at filename and attach the result to tarsqidoc.

    A new SourceDoc is created for filename and kept in self.sourcedoc. The
    file is then fed to the ParseFile routine of the Expat parser in
    self.parser, whose handlers are expected to fill in the text and tags of
    that SourceDoc. Afterwards the SourceDoc is finalized with finish() and
    assigned to tarsqidoc.sourcedoc.

    Arguments:
       filename - path of the file to parse
       tarsqidoc - object whose sourcedoc attribute receives the SourceDoc

    Returns None. Errors raised by open() or by the parser are not caught
    here."""
    self.sourcedoc = SourceDoc(filename)
    # Open in binary mode: Expat does its own decoding, and on Python 3
    # ParseFile requires read() to return bytes. The with-statement makes
    # sure the file is closed, also when the parser raises an error.
    with open(filename, 'rb') as fh:
        self.parser.ParseFile(fh)
    self.sourcedoc.finish()
    tarsqidoc.sourcedoc = self.sourcedoc
def parse_string(self, text, tarsqidoc):
    """Parse an XML string and attach the result to tarsqidoc.

    A new SourceDoc (created with None instead of a filename) is kept in
    self.sourcedoc. The string is handed to the Parse routine of the Expat
    parser in self.parser, whose handlers are expected to fill in the text
    and tags of that SourceDoc. Afterwards the SourceDoc is finalized with
    finish() and assigned to tarsqidoc.sourcedoc.

    Arguments:
       text - the complete XML document as a string
       tarsqidoc - object whose sourcedoc attribute receives the SourceDoc

    Returns None. Errors raised by the parser, including those for a
    document that ends prematurely, are not caught here."""
    self.sourcedoc = SourceDoc(None)
    # TODO: do we need to make sure that text is unicode?
    # The string is the whole document, so flag it as the final chunk.
    # Without the flag Expat keeps waiting for more input and never reports
    # end-of-document problems such as unclosed elements.
    self.parser.Parse(text, True)
    self.sourcedoc.finish()
    tarsqidoc.sourcedoc = self.sourcedoc
def parse_file(self, filename, tarsqidoc):
    """Parse the XML file at filename and attach the result to tarsqidoc.

    A new SourceDoc is created for filename and kept in self.sourcedoc. The
    full content of the file is read and handed to the Parse routine of the
    Expat parser in self.parser, whose handlers are expected to fill in the
    text and tags of that SourceDoc. Afterwards the SourceDoc is finalized
    with finish() and assigned to tarsqidoc.sourcedoc.

    Arguments:
       filename - path of the file to parse
       tarsqidoc - object whose sourcedoc attribute receives the SourceDoc

    Returns None. Errors raised while reading the file or by the parser,
    including those for a document that ends prematurely, are not caught
    here."""
    self.sourcedoc = SourceDoc(filename)
    # TODO: should codecs.open() be given an explicit encoding for non-ascii?
    # NOTE: self.parser.ParseFile(open(filename)) was replaced by reading
    # the content and calling Parse() while preparing to port the code to
    # Python3. The with-statement makes sure the file is closed again.
    with codecs.open(filename) as fh:
        content = fh.read()
    # ParseFile() used to finish the parse implicitly; with Parse() the
    # final-chunk flag is needed so that end-of-document errors (for
    # example unclosed elements) are still reported.
    self.parser.Parse(content, True)
    self.sourcedoc.finish()
    tarsqidoc.sourcedoc = self.sourcedoc
def parse_string(self, text, tarsqidoc):
    """Build a LIF object from a JSON string and store its contents in a new
    SourceDoc on tarsqidoc.

    The LIF object is kept in self.lif. The new SourceDoc is assigned to
    tarsqidoc.sourcedoc, its text variable is set to the value of the LIF
    text and its lif variable refers to the LIF object itself.

    Returns None."""
    lif = LIF(json_string=text)
    self.lif = lif
    source = SourceDoc()
    tarsqidoc.sourcedoc = source
    source.text = lif.text.value
    source.lif = lif
def _parse(self, tarsqidoc):
    """Create a new SourceDoc for tarsqidoc and fill it from self.topnodes.

    The SourceDoc is stored in self.sourcedoc and in tarsqidoc.sourcedoc,
    and tarsqidoc itself is stored in self.tarsqidoc. The text is taken from
    the data of the first child of the 'text' top node, after which the
    source tags, tarsqi tags, comments and metadata are added, in that
    order. Assumes self.topnodes was filled in before this is called.

    Returns None."""
    source = SourceDoc(None)
    self.sourcedoc = source
    self.tarsqidoc = tarsqidoc
    tarsqidoc.sourcedoc = source
    source.text = self.topnodes['text'].firstChild.data
    # The order of these steps is kept fixed.
    for add_part in (self._add_source_tags,
                     self._add_tarsqi_tags,
                     self._add_comments,
                     self._add_metadata):
        add_part()
def parse_file(self, filename):
    """Load the TTK file at filename and return its contents as a
    TarsqiDocument.

    After the DOM is loaded, a new SourceDoc for filename is stored in
    self.sourcedoc and wrapped in a TarsqiDocument that is stored in
    self.tarsqidoc. The text is taken from the data of the first child of
    the 'text' top node (presumably made available by _load_dom), after
    which the source tags, tarsqi tags, comments and metadata are added, in
    that order.

    Returns the TarsqiDocument in self.tarsqidoc."""
    self._load_dom(filename)
    source = SourceDoc(filename)
    self.sourcedoc = source
    document = TarsqiDocument(source, {})
    self.tarsqidoc = document
    text_node = self.topnodes['text']
    source.text = text_node.firstChild.data
    self._add_source_tags()
    self._add_tarsqi_tags()
    self._add_comments()
    self._add_metadata()
    return self.tarsqidoc
def parse_file(self, filename, tarsqidoc):
    """Load a LIF object from the JSON file at filename and store its
    contents in a new SourceDoc on tarsqidoc.

    When is_container() accepts the file, it is loaded as a Container and
    the LIF object is taken from the payload of that container; otherwise
    the file is loaded as a LIF object directly and self.container is set to
    None. The new SourceDoc is assigned to tarsqidoc.sourcedoc and receives
    the value of the LIF text in its text variable, the LIF object in its
    lif variable and the container (or None) in its lif_container variable.

    Returns None."""
    if not self.is_container(filename):
        self.container = None
        self.lif = LIF(json_file=filename)
    else:
        self.container = Container(json_file=filename)
        self.lif = self.container.payload
    source = SourceDoc(filename)
    tarsqidoc.sourcedoc = source
    source.text = self.lif.text.value
    source.lif = self.lif
    source.lif_container = self.container
def parse_file(self, filename):
    """Read the file at filename and return its content in a TarsqiDocument.

    The full file content, decoded as utf8, is simply dumped into the text
    variable of a new SourceDoc for filename. That SourceDoc is then handed
    to TarsqiDocument together with an empty dictionary.

    Arguments:
       filename - path of the file to read

    Returns the new TarsqiDocument. Errors raised while opening, reading or
    decoding the file are not caught here."""
    sourcedoc = SourceDoc(filename)
    # Use a with-statement so that the file is closed right after reading,
    # also when read() raises an error.
    with codecs.open(filename, encoding='utf8') as fh:
        sourcedoc.text = fh.read()
    return TarsqiDocument(sourcedoc, {})
class SourceParserXML(SourceParser):

    """Simple XML parser, using the Expat parser.

    Instance variables
       encoding - a string
       sourcedoc - an instance of SourceDoc
       parser - an Expat parser

    """

    # TODO: may need to add other handlers for completeness, see
    # http://docs.python.org/library/pyexpat.html, note however that if we
    # change our notion of primary data then we may not need to do that.

    # TODO. The way this is set up now requires the SourceDoc to know a lot
    # about the internal workings of the Expat parser (for example the notion
    # that begin and end tags are found separately). It is probably better to
    # keep that knowledge here, by building lists of tags here and only export
    # them after all elements are gathered (see note in parse_file).

    def __init__(self, encoding='utf-8'):
        """Set up the Expat parser and register all handlers on it. The
        encoding is stored and also handed to the parser."""
        self.encoding = encoding
        self.sourcedoc = None
        self.parser = xml.parsers.expat.ParserCreate(encoding=encoding)
        # Let the parser buffer character data so that the character data
        # handler is called as few times as possible.
        self.parser.buffer_text = 1
        self.parser.XmlDeclHandler = self._handle_xmldecl
        self.parser.ProcessingInstructionHandler = \
            self._handle_processing_instruction
        self.parser.CommentHandler = self._handle_comment
        self.parser.StartElementHandler = self._handle_start
        self.parser.EndElementHandler = self._handle_end
        self.parser.CharacterDataHandler = self._handle_characters
        self.parser.DefaultHandler = self._handle_default

    def parse_file(self, filename):
        """Parses filename and returns a TarsqiDocument that wraps the
        resulting SourceDoc. Uses the ParseFile routine of the expat parser,
        where all the handlers are set up to fill in the text and tags in
        SourceDoc. Errors raised by open() or by the parser are not caught
        here."""
        self.sourcedoc = SourceDoc(filename)
        # Open in binary mode: Expat does its own decoding, and on Python 3
        # ParseFile requires read() to return bytes. The with-statement makes
        # sure the file is closed, also when the parser raises an error.
        with open(filename, 'rb') as fh:
            self.parser.ParseFile(fh)
        self.sourcedoc.finish()
        tarsqidoc = TarsqiDocument(self.sourcedoc, {})
        return tarsqidoc

    def parse_string(self, text):
        """Parses a text string and returns a TarsqiDocument that wraps the
        resulting SourceDoc. Uses the Parse routine of the expat parser,
        where all the handlers are set up to fill in the text and tags in
        SourceDoc. Errors raised by the parser, including those for a
        document that ends prematurely, are not caught here."""
        self.sourcedoc = SourceDoc(None)
        # TODO: do we need to make sure that text is unicode?
        # The string is the whole document, so flag it as the final chunk.
        # This makes Parse report end-of-document problems (for example
        # unclosed elements), just like ParseFile does in parse_file.
        self.parser.Parse(text, True)
        self.sourcedoc.finish()
        tarsqidoc = TarsqiDocument(self.sourcedoc, {})
        return tarsqidoc

    def _handle_xmldecl(self, version, encoding, standalone):
        """Store the XML declaration as a (version, encoding, standalone)
        tuple on the SourceDoc."""
        self._debug('xmldec')
        self.sourcedoc.xmldecl = (version, encoding, standalone)

    def _handle_processing_instruction(self, target, data):
        """Store processing instructions by handing the target and data to
        the SourceDoc."""
        self._debug('proc', target, len(data))
        self.sourcedoc.add_processing_instruction(target, data)

    def _handle_comment(self, data):
        """Store comments by handing the comment data to the SourceDoc."""
        self._debug('comment', len(data))
        self.sourcedoc.add_comment(data)

    def _handle_start(self, name, attrs):
        """Handle opening tags. Takes two arguments: a tag name and a
        dictionary of attributes. Asks the SourceDoc instance in the
        sourcedoc variable to add an opening tag."""
        self._debug('start', name, attrs)
        self.sourcedoc.add_opening_tag(name, attrs)

    def _handle_end(self, name):
        """Add closing tags to the SourceDoc."""
        self._debug('end', name)
        self.sourcedoc.add_closing_tag(name)

    def _handle_characters(self, string):
        """Handle character data by asking the SourceDoc to add the data.
        This will not necessarily add a contiguous string of character data
        as one data element. This should include ignorable whitespace, but
        see the comment in the method below, I apparently had reason to
        think otherwise."""
        self._debug('chars', len(string), string)
        self.sourcedoc.add_characters(string)

    def _handle_default(self, string):
        """Handle default data by asking the SourceDoc to add it as
        characters. This is here to get the 'ignorable' whitespace, which I
        do not want to ignore."""
        # TODO: maybe ignore that whitespace after all, it does not seem to
        # matter though
        self._debug('default', len(string), string)
        self.sourcedoc.add_characters(string)

    def _debug(self, *rest):
        """Print the current position of the parser (line-column and byte
        index) followed by the given values, but only when
        SourceParser.DEBUG is true. Each value is passed through
        replace_newline before it is printed."""
        if SourceParser.DEBUG:
            p1 = "%s-%s" % (self.parser.CurrentLineNumber,
                            self.parser.CurrentColumnNumber)
            p2 = "%s" % self.parser.CurrentByteIndex
            print("%-5s %-4s %s" %
                  (p1, p2,
                   " ".join(["%-8s" % replace_newline(x) for x in rest])))
def parse_string(self, text, tarsqidoc):
    """Store a text string in a new SourceDoc on tarsqidoc.

    The new SourceDoc is created with None where a filename would otherwise
    go and is assigned to tarsqidoc.sourcedoc. The full string is simply
    dumped into its text variable.

    Returns None."""
    # TODO: do we need to ensure the text is unicode?
    source = SourceDoc(None)
    tarsqidoc.sourcedoc = source
    source.text = text