def convert_doc_to_latex(doc, verbatim_strings=None, use_gold_trees=False):
    """
    Render the CCG derivations found under an XML node as a LaTeX document.

    One CCG tree is selected per sentence, typeset with
    ``convert_node_to_latex`` and preceded by a label ("Premise N: " for
    every sentence but the last, "Conclusion: " for the last one) and the
    sentence surface string.

    Args:
        doc: XML node exposing ``xpath``. Sentences, semantics and tokens
            are looked up with the ``//sentence``, ``//semantics`` and
            ``//tokens`` queries.
        verbatim_strings: optional list of strings appended after the
            derivations, for debugging. ``None`` (the default) or an empty
            list appends nothing.
        use_gold_trees: when true, the tree at the index given by each
            sentence's ``gold_tree`` attribute (default ``'0'``) is used;
            otherwise the first ``ccg`` child of every sentence is used.

    Returns:
        The LaTeX source as a single string.

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of ``tokens`` nodes.
    """
    # Select one <ccg> node per sentence, then build all trees in one pass.
    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('//sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('//sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]

    sem_trees = [build_ccg_tree(c) for c in doc.xpath('//semantics')]
    if not sem_trees:
        # No semantics available: pair every CCG tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('//tokens')
    assert len(ccg_trees) == len(tokens)

    num_premises = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_premises)
    ]
    sentence_ids.append("Conclusion: ")

    # Collect fragments and join once instead of growing a string with +=.
    latex_parts = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        latex_parts.append(
            "\n\n\\vspace{2em}\n\n\\noindent\n"
            + sentence_ids[i] + sentence_surface
            + "\n\n\\medskip\n\n"
            + convert_node_to_latex(ccg_tree, sem_trees[i], tokens[i]))

    # NOTE(review): the verbatim section is emitted with HTML <p>/<pre>
    # markup even though the surrounding document is LaTeX. Kept as-is so
    # the output stays identical; confirm whether any caller relies on it.
    verbatim_parts = []
    if verbatim_strings:
        verbatim_parts.append("<p>Script piped to coq</p>")
        verbatim_parts.extend(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    # Raw strings keep the LaTeX backslashes literal (the previous code
    # relied on invalid escape sequences for two of these commands).
    # NOTE(review): the FC and BC macros expand to the same symbol; this is
    # preserved from the original and may be unintended.
    preamble = "\n".join([
        r"\documentclass{article}",
        r"\usepackage{proof,lscape}",
        r"\pagestyle{empty}",
        r"\newcommand{\rulelabelsize}{\scriptsize}",
        r"\newcommand{\FA}{\mbox{\rulelabelsize $<$}}",
        r"\newcommand{\BA}{\mbox{\rulelabelsize $>$}}",
        r"\newcommand{\FC}{\mbox{\rulelabelsize $>\!\mathbf{B}$}}",
        r"\newcommand{\BC}{\mbox{\rulelabelsize $>\!\mathbf{B}$}}",
        r"\newcommand{\SR}[1]{\begin{tabular}{c}$#1$\end{tabular}}",
        r"\begin{document}",
        r"\small",
        "",  # trailing newline after the last preamble line
    ])

    return "".join(
        [preamble, "%\\begin{landscape}"]
        + latex_parts
        + verbatim_parts
        + ["\n%\\end{landscape}", "\n\\end{document}"])
def convert_doc_to_mathml(doc, use_gold_trees=False):
    """
    Convert an XML <document> node into a presentation MathML string.

    Every CCG tree of every sentence is rendered, paired with the semantics
    node at the same position when there is one. Sentences are labelled
    "Premise <index>" except for the last one, which is labelled
    "Conclusion". Trees that fail to build with a ValueError are reported
    with a short message instead of a visualization. Any coq script text
    found under the document's proof section is appended verbatim.

    If a sentence has no <tokens> child, the MathML collected so far is
    returned immediately (without the coq script section).
    """
    last_sentence_index = int(doc.xpath('count(./sentences/sentence)')) - 1
    fragments = []
    for sent_ind, sentence in enumerate(doc.xpath('./sentences/sentence')):
        gold_tree_index = int(sentence.get('gold_tree', -1))
        if sent_ind < last_sentence_index:
            sentence_label = 'Premise {0}'.format(sent_ind)
        else:
            sentence_label = 'Conclusion'
        sentence_text = get_surf_from_xml_node(sentence)
        ccg_nodes = sentence.xpath('./ccg')
        sem_nodes = sentence.xpath('./semantics')
        token_nodes = sentence.xpath('./tokens')
        if not token_nodes:
            # Nothing can be rendered without tokens: stop here.
            return "".join(fragments)
        tokens = token_nodes[0]
        assert len(ccg_nodes) >= len(sem_nodes)
        for i, ccg_node in enumerate(ccg_nodes):
            ccg_tree_id = ccg_node.get('id', str(i))
            try:
                ccg_tree = build_ccg_tree(ccg_node)
            except ValueError:
                fragments.append("<p>{0}, tree {1}: {2}</p>\n".format(
                    sentence_label, ccg_tree_id, sentence_text))
                fragments.append(
                    "<p>Syntactic parse error. Visualization skipped.</p>")
                continue
            if i == gold_tree_index:
                ccg_tree_id += " (gold)"
            # Semantics are optional; there may be fewer of them than trees.
            if i < len(sem_nodes):
                sem_tree = build_ccg_tree(sem_nodes[i])
            else:
                sem_tree = None
            header = "<p>{0}, tree {1}: {2}</p>\n".format(
                sentence_label, ccg_tree_id, sentence_text)
            rendered = convert_node_to_mathml(ccg_tree, sem_tree, tokens)
            fragments.append(
                header
                + "<math xmlns='http://www.w3.org/1998/Math/MathML'>\n"
                + rendered
                + "</math>\n")

    coq_scripts = doc.xpath(
        './proof/master_theorem/theorems/theorem/coq_script/text()')
    verbatim_fragments = []
    if coq_scripts:
        verbatim_fragments.append("<p>Script piped to coq</p>")
        for script in coq_scripts:
            verbatim_fragments.append("<pre>\n" + script + "\n</pre>\n")

    return '{0}\n{1}'.format("".join(fragments), "".join(verbatim_fragments))
def test_NonterminalButpreterminal(self):
    # Attributes are collected from the root's second child (the unary
    # "lex2" node over the second token), not from the root itself.
    xml_source = r"""
      <sentence id="s1">
        <tokens>
          <token surf="surf1" id="t1_1"/>
          <token surf="surf2" id="t1_2"/>
        </tokens>
        <ccg root="sp1-5">
          <span terminal="t1_1" category="cat1" id="sp1-1"/>
          <span terminal="t1_2" category="cat2" id="sp1-2"/>
          <span child="sp1-1" rule="lex1" category="NP1" id="sp1-3"/>
          <span child="sp1-2" rule="lex2" category="NP2" id="sp1-4"/>
          <span child="sp1-3 sp1-4" rule="rr" category="NPP" id="sp1-5"/>
        </ccg>
      </sentence>
    """
    sentence_node = etree.fromstring(xml_source)
    root_node = build_ccg_tree(sentence_node.find("ccg"))
    token_nodes = sentence_node.find("tokens")
    attributes = get_attributes_from_ccg_node_recursively(
        root_node[1], token_nodes)
    expected_attributes = {
        'category': 'NP2',
        'rule': 'lex2',
        'id': 'sp1-4',
        'child': 'sp1-2',
        'child0_terminal': 't1_2',
        'child0_surf': 'surf2',
        'child0_category': 'cat2',
        'child0_id': 'sp1-2',
    }
    mismatch_message = '\n{0}\nvs.\n{1}'.format(
        expected_attributes, attributes)
    self.assertEqual(
        len(expected_attributes), len(attributes), mismatch_message)
    for key, expected_value in expected_attributes.items():
        self.assertEqual(expected_value, attributes.get(key, None))
def test_nonterminal1(self):
    # Single token under two stacked unary nodes; attributes are collected
    # from the root, so child keys nest two levels deep.
    xml_source = r"""
      <sentence id="s1">
        <tokens>
          <token surf="surf1" id="t1_1"/>
        </tokens>
        <ccg root="sp1-3">
          <span terminal="t1_1" category="cat1" id="sp1-1"/>
          <span child="sp1-1" rule="lex" category="NP" id="sp1-2"/>
          <span child="sp1-2" rule="rr" category="NPP" id="sp1-3"/>
        </ccg>
      </sentence>
    """
    sentence_node = etree.fromstring(xml_source)
    root_node = build_ccg_tree(sentence_node.find("ccg"))
    token_nodes = sentence_node.find("tokens")
    attributes = get_attributes_from_ccg_node_recursively(
        root_node, token_nodes)
    expected_attributes = {
        'category': 'NPP',
        'rule': 'rr',
        'id': 'sp1-3',
        'child': 'sp1-2',
        'child0_category': 'NP',
        'child0_rule': 'lex',
        'child0_id': 'sp1-2',
        'child0_child': 'sp1-1',
        'child0_child0_terminal': 't1_1',
        'child0_child0_surf': 'surf1',
        'child0_child0_category': 'cat1',
        'child0_child0_id': 'sp1-1',
    }
    self.assertEqual(len(expected_attributes), len(attributes))
    for key, expected_value in expected_attributes.items():
        self.assertEqual(expected_value, attributes.get(key, None))
def test_terminal(self):
    # A lone terminal span: the expected attributes combine the span's own
    # attributes with those of the token it points to.
    xml_source = r"""
      <sentence id="s1">
        <tokens>
          <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        </tokens>
        <ccg root="sp1-1">
          <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        </ccg>
      </sentence>
    """
    sentence_node = etree.fromstring(xml_source)
    root_node = build_ccg_tree(sentence_node.find("ccg"))
    token_nodes = sentence_node.find("tokens")
    attributes = get_attributes_from_ccg_node_recursively(
        root_node, token_nodes)
    expected_attributes = {
        'terminal': 't1_1',
        'category': 'cat1',
        'end': '2',
        'begin': '1',
        'id': 'sp1-1',
        'base': 'base1',
        'pos': 'pos1',
        'surf': 'surf1',
    }
    self.assertEqual(len(expected_attributes), len(attributes))
    for key, expected_value in expected_attributes.items():
        # Direct indexing on purpose: a missing key must raise.
        self.assertEqual(expected_value, attributes[key])
def test_preterminal(self):
    # One unary node directly above a terminal; the terminal's attributes
    # are expected under the "child0_" prefix.
    xml_source = r"""
      <sentence id="s1">
        <tokens>
          <token surf="surf1" id="t1_1"/>
        </tokens>
        <ccg root="sp1-2">
          <span terminal="t1_1" category="cat1" id="sp1-1"/>
          <span child="sp1-1" rule="lex" category="NP" id="sp1-2"/>
        </ccg>
      </sentence>
    """
    sentence_node = etree.fromstring(xml_source)
    root_node = build_ccg_tree(sentence_node.find("ccg"))
    token_nodes = sentence_node.find("tokens")
    attributes = get_attributes_from_ccg_node_recursively(
        root_node, token_nodes)
    expected_attributes = {
        'category': 'NP',
        'rule': 'lex',
        'id': 'sp1-2',
        'child': 'sp1-1',
        'child0_terminal': 't1_1',
        'child0_surf': 'surf1',
        'child0_category': 'cat1',
        'child0_id': 'sp1-1',
    }
    self.assertEqual(len(expected_attributes), len(attributes))
    for key, expected_value in expected_attributes.items():
        self.assertEqual(expected_value, attributes.get(key, None))
def test_terminal(self):
    # Terminal-only tree: span attributes and token attributes are expected
    # to be merged into one flat mapping.
    sentence_xml = r"""
      <sentence id="s1">
        <tokens>
          <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
        </tokens>
        <ccg root="sp1-1">
          <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
        </ccg>
      </sentence>
    """
    sentence_elem = etree.fromstring(sentence_xml)
    ccg_elem = sentence_elem.find("ccg")
    tree_root = build_ccg_tree(ccg_elem)
    tokens_elem = sentence_elem.find("tokens")
    attributes = get_attributes_from_ccg_node_recursively(
        tree_root, tokens_elem)
    expected_pairs = [
        ('terminal', 't1_1'),
        ('category', 'cat1'),
        ('end', '2'),
        ('begin', '1'),
        ('id', 'sp1-1'),
        ('base', 'base1'),
        ('pos', 'pos1'),
        ('surf', 'surf1'),
    ]
    expected_attributes = dict(expected_pairs)
    self.assertEqual(len(expected_attributes), len(attributes))
    for name in expected_attributes:
        # Indexing (not .get) so that an absent key raises.
        self.assertEqual(expected_attributes[name], attributes[name])
def test_nonterminal2(self):
    # Binary root over two unary nodes, each above one terminal; both
    # branches are expected under the "child0_" and "child1_" prefixes.
    xml_source = r"""
      <sentence id="s1">
        <tokens>
          <token surf="surf1" id="t1_1"/>
          <token surf="surf2" id="t1_2"/>
        </tokens>
        <ccg root="sp1-5">
          <span terminal="t1_1" category="cat1" id="sp1-1"/>
          <span terminal="t1_2" category="cat2" id="sp1-2"/>
          <span child="sp1-1" rule="lex1" category="NP1" id="sp1-3"/>
          <span child="sp1-2" rule="lex2" category="NP2" id="sp1-4"/>
          <span child="sp1-3 sp1-4" rule="rr" category="NPP" id="sp1-5"/>
        </ccg>
      </sentence>
    """
    sentence_node = etree.fromstring(xml_source)
    root_node = build_ccg_tree(sentence_node.find("ccg"))
    token_nodes = sentence_node.find("tokens")
    attributes = get_attributes_from_ccg_node_recursively(
        root_node, token_nodes)
    expected_attributes = {
        # Root node.
        'category': 'NPP',
        'rule': 'rr',
        'id': 'sp1-5',
        'child': 'sp1-3 sp1-4',
        # Left branch.
        'child0_category': 'NP1',
        'child0_rule': 'lex1',
        'child0_id': 'sp1-3',
        'child0_child': 'sp1-1',
        'child0_child0_terminal': 't1_1',
        'child0_child0_surf': 'surf1',
        'child0_child0_category': 'cat1',
        'child0_child0_id': 'sp1-1',
        # Right branch.
        'child1_category': 'NP2',
        'child1_rule': 'lex2',
        'child1_id': 'sp1-4',
        'child1_child': 'sp1-2',
        'child1_child0_terminal': 't1_2',
        'child1_child0_surf': 'surf2',
        'child1_child0_category': 'cat2',
        'child1_child0_id': 'sp1-2',
    }
    mismatch_message = '\n{0}\nvs.\n{1}'.format(
        expected_attributes, attributes)
    self.assertEqual(
        len(expected_attributes), len(attributes), mismatch_message)
    for key, expected_value in expected_attributes.items():
        self.assertEqual(expected_value, attributes.get(key, None))
def convert_vertical_to_mathml(doc, verbatim_strings=None,
                               use_gold_trees=False):
    """
    Render the CCG derivations found under an XML node as an HTML page.

    One CCG tree is selected per sentence and converted to presentation
    MathML with ``convert_node_to_mathml``. Each derivation is preceded by
    a paragraph holding a label ("Premise N: " for every sentence but the
    last, "Conclusion: " for the last one) and the sentence surface string.
    The result is wrapped in an HTML page that loads MathJax and carries a
    MathJax configuration block.

    Args:
        doc: XML node exposing ``xpath``. Sentences, semantics and tokens
            are looked up with the ``//sentence``, ``//semantics`` and
            ``//tokens`` queries.
        verbatim_strings: optional list of strings printed verbatim (each
            in its own <pre> element) at the end of the page, for
            debugging. ``None`` (the default) or an empty list prints
            nothing.
        use_gold_trees: when true, the tree at the index given by each
            sentence's ``gold_tree`` attribute (default ``'0'``) is used;
            otherwise the first ``ccg`` child of every sentence is used.

    Returns:
        The complete HTML page as a string.

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of ``tokens`` nodes.
    """
    # Select one <ccg> node per sentence, then build all trees in one pass.
    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('//sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('//sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]

    sem_trees = [build_ccg_tree(c) for c in doc.xpath('//semantics')]
    if not sem_trees:
        # No semantics available: pair every CCG tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('//tokens')
    assert len(ccg_trees) == len(tokens)

    num_premises = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_premises)
    ]
    sentence_ids.append("Conclusion: ")

    # Collect fragments and join once instead of growing a string with +=.
    mathml_parts = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        mathml_parts.append(
            "<p>" + sentence_ids[i] + sentence_surface + "</p>\n"
            + "<math xmlns='http://www.w3.org/1998/Math/MathML'>"
            + convert_node_to_mathml(ccg_tree, sem_trees[i], tokens[i])
            + "</math>")

    verbatim_parts = []
    if verbatim_strings:
        verbatim_parts.append("<p>Script piped to coq</p>")
        verbatim_parts.extend(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    # NOTE(review): this is a non-raw string, so the doubled backslashes in
    # inlineMath reach the page as single backslashes. Also, the config
    # script follows the script that loads MathJax; confirm against the
    # MathJax documentation that both are intended.
    html_head = """\
<!doctype html>
<html lang='en'>
<head>
  <meta charset='UTF-8'>
  <title>CCG to Lambda conversion</title>
  <style>
    body {
      font-size: 1em;
    }
  </style>
  <script type="text/javascript"
          src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
  </script>
  <script type="text/x-mathjax-config">
    MathJax.Hub.Config({
      tex2jax: {
        inlineMath: [['$','$'], ['\\(','\\)']],
        processEscapes: true
      },
      CommonHTML: {
        matchFontHeight: false
      },
      displayAlign: "left",
      displayIndent: "2em"
    });
    MathJax.Hub.Config({
      "HTML-CSS": {
        availableFonts: [],
        preferredFont: null,
        webFont: "Neo-Euler"
      }
    });
  </script>
</head>
<body>
"""
    html_tail = """\
</body>
</html>
"""
    return "".join([html_head] + mathml_parts + verbatim_parts + [html_tail])
def convert_doc_to_mathml_(doc, verbatim_strings=None, use_gold_trees=False):
    """
    Render the CCG derivations of an XML <document> as an escaped HTML page.

    One CCG tree is selected per sentence under ``./sentences/sentence``
    and converted to presentation MathML with ``convert_node_to_mathml``.
    Each derivation is preceded by a paragraph holding a label
    ("Premise N: " for every sentence but the last, "Conclusion: " for the
    last one) and the sentence surface string. The page is then
    HTML-escaped as a whole ("&", "<" and ">" are replaced; quotes are left
    alone), so the returned text is the page source in escaped form.

    Args:
        doc: XML node exposing ``xpath``.
        verbatim_strings: optional list of strings printed verbatim (each
            in its own <pre> element) at the end of the page, for
            debugging. ``None`` (the default) or an empty list prints
            nothing.
        use_gold_trees: when true, the tree at the index given by each
            sentence's ``gold_tree`` attribute (default ``'0'``) is used;
            otherwise the first ``ccg`` child of every sentence is used.

    Returns:
        The HTML-escaped page as a string.

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of ``tokens`` nodes.
    """
    # cgi.escape no longer exists in current Python; html.escape with
    # quote=False escapes the same three characters.
    import html

    # Select one <ccg> node per sentence, then build all trees in one pass.
    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('./sentences/sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('./sentences/sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]

    sem_trees = [
        build_ccg_tree(c) for c in doc.xpath('./sentences/sentence/semantics')
    ]
    if not sem_trees:
        # No semantics available: pair every CCG tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('./sentences/sentence/tokens')
    assert len(ccg_trees) == len(tokens)

    num_premises = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_premises)
    ]
    sentence_ids.append("Conclusion: ")

    # Collect fragments and join once instead of growing a string with +=.
    mathml_parts = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        mathml_parts.append(
            "<p>" + sentence_ids[i] + sentence_surface + "</p>\n"
            + "<math xmlns='http://www.w3.org/1998/Math/MathML'>"
            + convert_node_to_mathml(ccg_tree, sem_trees[i], tokens[i])
            + "</math>")

    verbatim_parts = []
    if verbatim_strings:
        verbatim_parts.append("<p>Script piped to coq</p>")
        verbatim_parts.extend(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    html_head = """\
<!doctype html>
<html lang='en'>
<head>
  <style>
    body {
      font-size: 1em;
    }
  </style>
  <meta charset='UTF-8'>
  <title>CCG to Lambda conversion</title>
  <script type="text/javascript"
          src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
  </script>
</head>
<body>
"""
    html_tail = """\
</body>
</html>
"""
    page = "".join([html_head] + mathml_parts + verbatim_parts + [html_tail])
    return html.escape(page, quote=False)
def convert_root_to_mathml(root, verbatim_strings=None, use_gold_trees=False):
    """
    Render every <document> under an XML root as one HTML page.

    For each document, every CCG tree of every sentence is converted to
    presentation MathML with ``convert_node_to_mathml``, paired with the
    semantics node at the same position when there is one. Sentences are
    labelled "Premise <index>" except for the last one of each document,
    which is labelled "Conclusion"; the tree flagged by the sentence's
    ``gold_tree`` attribute gets a " (gold)" suffix on its id.

    Args:
        root: XML node exposing ``xpath``; documents are looked up with
            ``./document``.
        verbatim_strings: optional list of strings printed verbatim (each
            in its own <pre> element) after every document, for debugging.
            ``None`` (the default) or an empty list prints nothing.
        use_gold_trees: accepted for signature compatibility; not read by
            this function.

    Returns:
        The complete HTML page as a string.

    Raises:
        AssertionError: if a sentence has more semantics nodes than CCG
            trees.
        IndexError: if a sentence has no ``tokens`` child.
    """
    # The verbatim section does not depend on the document: build it once.
    verbatim_text = ""
    if verbatim_strings:
        verbatim_text = "<p>Script piped to coq</p>" + "".join(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    doc_mathml_strs = []
    for doc in root.xpath('./document'):
        num_sentences = int(doc.xpath('count(./sentences/sentence)'))
        mathml_parts = []
        for sent_ind, sentence in enumerate(
                doc.xpath('./sentences/sentence')):
            gold_tree_index = int(sentence.get('gold_tree', -1))
            if sent_ind < num_sentences - 1:
                sentence_label = 'Premise {0}'.format(sent_ind)
            else:
                sentence_label = 'Conclusion'
            sentence_text = get_surf_from_xml_node(sentence)
            ccg_nodes = sentence.xpath('./ccg')
            sem_nodes = sentence.xpath('./semantics')
            tokens = sentence.xpath('./tokens')[0]
            assert len(ccg_nodes) >= len(sem_nodes)
            for i, ccg_node in enumerate(ccg_nodes):
                ccg_tree_id = ccg_node.get('id', str(i))
                ccg_tree = build_ccg_tree(ccg_node)
                if gold_tree_index == i:
                    ccg_tree_id += " (gold)"
                # Semantics are optional; there may be fewer than trees.
                sem_tree = None
                if i < len(sem_nodes):
                    sem_tree = build_ccg_tree(sem_nodes[i])
                mathml_parts.append(
                    "<p>{0}, tree {1}: {2}</p>\n".format(
                        sentence_label, ccg_tree_id, sentence_text)
                    + "<math xmlns='http://www.w3.org/1998/Math/MathML'>\n"
                    + convert_node_to_mathml(ccg_tree, sem_tree, tokens)
                    + "</math>\n")
        doc_mathml_strs.append(
            '{0}\n{1}'.format("".join(mathml_parts), verbatim_text))

    html_head = """\
<!doctype html>
<html lang='en'>
<head>
  <meta charset='UTF-8'>
  <title>CCG to Lambda conversion</title>
  <style>
    body {
      font-size: 1em;
    }
  </style>
  <script type="text/javascript"
          src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
  </script>
</head>
<body>
"""
    html_tail = """\
</body>
</html>
"""
    return html_head + '\n'.join(doc_mathml_strs) + html_tail
def convert_doc_to_mathml(doc, verbatim_strings=None, use_gold_trees=False):
    """
    Render the CCG derivations found under an XML node as an HTML page.

    One CCG tree is selected per sentence and converted to presentation
    MathML with ``convert_node_to_mathml``. Each derivation is preceded by
    a paragraph holding a label ("Premise N: " for every sentence but the
    last, "Conclusion: " for the last one) and the sentence surface string.
    The result is wrapped in an HTML page that loads MathJax.

    Args:
        doc: XML node exposing ``xpath``. Sentences, semantics and tokens
            are looked up with the ``//sentence``, ``//semantics`` and
            ``//tokens`` queries.
        verbatim_strings: optional list of strings printed verbatim (each
            in its own <pre> element) at the end of the page, for
            debugging. ``None`` (the default) or an empty list prints
            nothing.
        use_gold_trees: when true, the tree at the index given by each
            sentence's ``gold_tree`` attribute (default ``'0'``) is used;
            otherwise the first ``ccg`` child of every sentence is used.

    Returns:
        The complete HTML page as a string.

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of ``tokens`` nodes.
    """
    # Select one <ccg> node per sentence, then build all trees in one pass.
    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('//sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('//sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]

    sem_trees = [build_ccg_tree(c) for c in doc.xpath('//semantics')]
    if not sem_trees:
        # No semantics available: pair every CCG tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('//tokens')
    assert len(ccg_trees) == len(tokens)

    num_premises = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_premises)
    ]
    sentence_ids.append("Conclusion: ")

    # Collect fragments and join once instead of growing a string with +=.
    mathml_parts = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        mathml_parts.append(
            "<p>" + sentence_ids[i] + sentence_surface + "</p>\n"
            + "<math xmlns='http://www.w3.org/1998/Math/MathML'>"
            + convert_node_to_mathml(ccg_tree, sem_trees[i], tokens[i])
            + "</math>")

    verbatim_parts = []
    if verbatim_strings:
        verbatim_parts.append("<p>Script piped to coq</p>")
        verbatim_parts.extend(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    html_head = """\
<!doctype html>
<html lang='en'>
<head>
  <meta charset='UTF-8'>
  <title>CCG to Lambda conversion</title>
  <style>
    body {
      font-size: 1em;
    }
  </style>
  <script type="text/javascript"
          src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
  </script>
</head>
<body>
"""
    html_tail = """\
</body>
</html>
"""
    return "".join([html_head] + mathml_parts + verbatim_parts + [html_tail])