def chunk_parse(self, grammar, no_blanks=True, incomplete='record', **kwargs):
    """
    Returns an element tree structure corresponding to a toolbox data file
    parsed according to the chunk grammar.

    The database is obtained from C{self.parse(**kwargs)}.  Its C{header}
    element is copied across unchanged; every C{record} element is turned
    into a list of C{(text, tag)} pairs, chunk parsed, and the resulting
    tree is converted back into an element with C{self._tree2etree}.

    @type grammar: string
    @param grammar: Contains the chunking rules used to parse the
        database.  See L{chunk.RegExp} for documentation.
    @type no_blanks: boolean
    @param no_blanks: blank fields that are not important to the structure
        are deleted.  The flag is handed on to C{self._tree2etree}.
    @type incomplete: string
    @param incomplete: name of element used if parse doesn't result in one
        toplevel element
    @type kwargs: keyword arguments dictionary
    @param kwargs: Keyword arguments passed to
        L{toolbox.StandardFormat.fields()}
    @rtype: ElementTree._ElementInterface
    @return: Contents of toolbox data parsed according to the rules in
        grammar
    """
    from nltk import chunk
    from nltk.parse import Tree

    cp = chunk.RegexpParser(grammar)
    db = self.parse(**kwargs)
    tb_etree = Element('toolbox_data')
    header = db.find('header')
    tb_etree.append(header)
    for record in db.findall('record'):
        parsed = cp.parse([(elem.text, elem.tag) for elem in record])
        # Check the length before indexing: a record with no fields yields
        # an empty parse, and parsed[0] would raise IndexError.
        if len(parsed) == 1 and isinstance(parsed[0], Tree):
            top = parsed[0]
        else:
            # didn't get a full parse: keep the whole result, relabelled
            parsed.node = incomplete
            top = parsed
        tb_etree.append(self._tree2etree(top, no_blanks))
    return tb_etree
def _tree2etree(self, parent, no_blanks=False):
    """
    Convert a chunk parse tree into an element tree.

    Subtrees become nested elements named after the subtree's C{node}
    value; leaves, which are C{(text, tag)} pairs, become subelements
    named C{tag} whose text is C{text}.

    @type parent: C{Tree}
    @param parent: the tree to convert
    @type no_blanks: boolean
    @param no_blanks: if true, leaves whose text is empty or C{None} are
        left out of the result.  Defaults to C{False}, which keeps every
        leaf.
    @rtype: C{ElementTree._ElementInterface}
    @return: element whose tag is C{parent.node} and whose children
        mirror the children of C{parent}
    """
    from nltk.parse import Tree

    root = Element(parent.node)
    for child in parent:
        if isinstance(child, Tree):
            # Pass the flag down so nested chunks are filtered the same way.
            root.append(self._tree2etree(child, no_blanks))
        else:
            text, tag = child
            if not no_blanks or text:
                e = SubElement(root, tag)
                e.text = text
    return root
def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).

    Each field is written on its own line as a backslash, the field's tag
    and the field's text.  A space separates tag and text when the text
    matches the module's C{_is_value} pattern; otherwise they are written
    with nothing between them.  A field without text (C{None}) is written
    as if its text were empty.  Records are separated by one blank line.

    @param tree: flat representation of toolbox data (whole database or
        single record)
    @type tree: C{ElementTree._ElementInterface}
    @param encoding: Name of an encoding to use.  If C{None}, field lines
        are not encoded.
    @type encoding: C{string}
    @param errors: Error handling scheme for codec. Same as the C{encode}
        inbuilt string method.
    @type errors: C{string}
    @param unicode_fields: names of fields that are encoded as utf8
        instead of C{encoding}.  May only be given together with
        C{encoding}.
    @type unicode_fields: C{dictionary} or C{set} of field names
    @rtype: C{string}
    @return: C{string} using standard format markup
    @raise ValueError: if C{tree} is neither a C{record} nor a
        C{toolbox_data} element, or if C{unicode_fields} is given without
        C{encoding}
    """
    if tree.tag == 'record':
        # Wrap a lone record so it is processed like a one-record database.
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if encoding is not specified then neither should unicode_fields")

    pieces = []
    for rec in tree:
        # Every record is preceded by a separator; the very first one is
        # dropped again when the pieces are joined.
        pieces.append('\n')
        for field in rec:
            mkr = field.tag
            value = field.text
            if value is None:
                # An element without text would make re.search raise
                # TypeError; emit the bare marker instead.
                value = ''
            if re.search(_is_value, value):
                sep = ' '
            else:
                sep = ''
            if encoding is None:
                pieces.append("\\%s%s%s\n" % (mkr, sep, value))
            else:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                line = u"\\%s%s%s\n" % (mkr, sep, value)
                pieces.append(line.encode(cur_encoding, errors))
    return ''.join(pieces[1:])
def inline_char_coded_text(tag, s):
    """Build an element from char coded text and convert its character
    codes to span elements.

    A new element named C{tag} is created, C{s} is stored as its text, and
    the element is passed to C{inline_char_coded_elem}.  The return value
    of that call is ignored; the element created here is what is returned.

    @param tag: tag for returned element
    @type tag: C{String}
    @param s: char coded text, used as the text of the new element
    @type s: C{String}
    @return: an element with the character code styles converted to spans
        elements.
    @rtype: C{ElementTree._ElementInterface}
    """
    wrapper = Element(tag)
    wrapper.text = s
    # Conversion happens on the element itself; nothing is read back.
    inline_char_coded_elem(wrapper)
    return wrapper
def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None):
    """Return a string with a standard format representation of the toolbox
    data in tree (tree can be a toolbox database or a single record).

    Each field is written on its own line as a backslash, the field's tag
    and the field's text.  A space separates tag and text when the text
    matches the module's C{_is_value} pattern; otherwise they are written
    with nothing between them.  A field without text (C{None}) is written
    as if its text were empty.  Records are separated by one blank line.

    @param tree: flat representation of toolbox data (whole database or
        single record)
    @type tree: C{ElementTree._ElementInterface}
    @param encoding: Name of an encoding to use.  If C{None}, field lines
        are not encoded.
    @type encoding: C{string}
    @param errors: Error handling scheme for codec. Same as the C{encode}
        inbuilt string method.
    @type errors: C{string}
    @param unicode_fields: names of fields that are encoded as utf8
        instead of C{encoding}.  May only be given together with
        C{encoding}.
    @type unicode_fields: C{dictionary} or C{set} of field names
    @rtype: C{string}
    @return: C{string} using standard format markup
    @raise ValueError: if C{tree} is neither a C{record} nor a
        C{toolbox_data} element, or if C{unicode_fields} is given without
        C{encoding}
    """
    if tree.tag == 'record':
        # Wrap a lone record so it is processed like a one-record database.
        root = Element('toolbox_data')
        root.append(tree)
        tree = root

    if tree.tag != 'toolbox_data':
        raise ValueError("not a toolbox_data element structure")
    if encoding is None and unicode_fields is not None:
        raise ValueError(
            "if encoding is not specified then neither should unicode_fields")

    pieces = []
    for rec in tree:
        # Every record is preceded by a separator; the very first one is
        # dropped again when the pieces are joined.
        pieces.append('\n')
        for field in rec:
            mkr = field.tag
            value = field.text
            if value is None:
                # An element without text would make re.search raise
                # TypeError; emit the bare marker instead.
                value = ''
            if re.search(_is_value, value):
                sep = ' '
            else:
                sep = ''
            if encoding is None:
                pieces.append("\\%s%s%s\n" % (mkr, sep, value))
            else:
                if unicode_fields is not None and mkr in unicode_fields:
                    cur_encoding = 'utf8'
                else:
                    cur_encoding = encoding
                line = u"\\%s%s%s\n" % (mkr, sep, value)
                pieces.append(line.encode(cur_encoding, errors))
    return ''.join(pieces[1:])
def _chunk_parse(self, grammar=None, top_node='record', trace=0, **kwargs):
    """
    Returns an element tree structure corresponding to a toolbox data file
    parsed according to the chunk grammar.

    The database comes from C{self.parse(**kwargs)}.  Its C{header} element
    is appended to the result as is; each C{record} element is reduced to
    a list of C{(text, tag)} pairs, chunk parsed, and converted back to an
    element by C{self._tree2etree}.

    @type grammar: C{string}
    @param grammar: Contains the chunking rules used to parse the
        database.  See L{chunk.RegExp} for documentation.
    @type top_node: C{string}
    @param top_node: The node value that should be used for the
        top node of the chunk structure.
    @type trace: C{int}
    @param trace: The level of tracing that should be used when
        parsing a text.  C{0} will generate no tracing output;
        C{1} will generate normal tracing output; and C{2} or
        higher will generate verbose tracing output.
    @type kwargs: C{dictionary}
    @param kwargs: Keyword arguments passed to
        L{toolbox.StandardFormat.fields()}
    @rtype: C{ElementTree._ElementInterface}
    @return: Contents of toolbox data parsed according to the rules in
        grammar
    """
    from nltk import chunk
    from nltk.parse import Tree

    parser = chunk.RegexpParser(grammar, top_node=top_node, trace=trace)
    database = self.parse(**kwargs)
    result = Element('toolbox_data')
    result.append(database.find('header'))
    for rec in database.findall('record'):
        # The chunker works on (text, tag) pairs, one per field.
        tagged_fields = []
        for field in rec:
            tagged_fields.append((field.text, field.tag))
        chunked = parser.parse(tagged_fields)
        result.append(self._tree2etree(chunked))
    return result