Пример #1
0
def convert_doc_to_latex(doc, verbatim_strings=None, use_gold_trees=False):
    """
    Convert the CCG derivations found in an XML document into LaTeX source.

    One <ccg> node is selected per <sentence>: the one indexed by the
    sentence's 'gold_tree' attribute (default '0') when use_gold_trees is
    true, otherwise the first one. Every selected tree is rendered with
    convert_node_to_latex(), together with the <tokens> node at the same
    position and, if the document contains <semantics> nodes, the semantic
    tree at the same position. The last sentence is labelled "Conclusion",
    the preceding ones "Premise 1", "Premise 2", ...

    Args:
        doc: XML node exposing an xpath() method.
        verbatim_strings: optional sequence of strings appended after the
            derivations, for debugging. None (the default) or an empty
            sequence appends nothing.
        use_gold_trees: select each sentence's gold tree instead of its
            first tree.

    Returns:
        The source of a complete LaTeX document as a string.

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of <tokens> nodes.
    """
    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('//sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('//sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]
    sem_trees = [build_ccg_tree(c) for c in doc.xpath('//semantics')]
    if not sem_trees:
        # No semantics available: pair every syntactic tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('//tokens')
    assert len(ccg_trees) == len(tokens)

    # Every sentence but the last is a premise; the last is the conclusion.
    num_hypotheses = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_hypotheses)
    ]
    sentence_ids.append("Conclusion: ")

    sections = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        sections.append(
            "\n\n\\vspace{2em}\n\n\\noindent\n"
            + sentence_ids[i] + sentence_surface
            # The backslash is doubled: "\m" is not a valid escape sequence.
            + "\n\n\\medskip\n\n"
            + convert_node_to_latex(ccg_tree, sem_trees[i], tokens[i]))
    latex_str = "".join(sections)

    # NOTE(review): the debugging text below is wrapped in HTML tags although
    # the surrounding document is LaTeX. Kept as-is to preserve the output;
    # confirm whether a verbatim environment was intended.
    verbatim_text = ""
    if verbatim_strings:
        verbatim_text = "<p>Script piped to coq</p>" + "".join(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    # NOTE(review): \FC and \BC expand to the same symbol; confirm that this
    # is intended.
    preamble = "".join([
        "\\documentclass{article}\n",
        "\\usepackage{proof,lscape}\n",
        "\\pagestyle{empty}\n",
        "\\newcommand{\\rulelabelsize}{\\scriptsize}\n",
        "\\newcommand{\\FA}{\\mbox{\\rulelabelsize $<$}}\n",
        "\\newcommand{\\BA}{\\mbox{\\rulelabelsize $>$}}\n",
        "\\newcommand{\\FC}{\\mbox{\\rulelabelsize $>\\!\\mathbf{B}$}}\n",
        "\\newcommand{\\BC}{\\mbox{\\rulelabelsize $>\\!\\mathbf{B}$}}\n",
        "\\newcommand{\\SR}[1]{\\begin{tabular}{c}$#1$\\end{tabular}}\n",
        "\\begin{document}\n",
        "\\small\n",
    ])

    # The landscape environment is emitted commented out ("%" prefix).
    return (preamble
            + "%\\begin{landscape}"
            + latex_str
            + verbatim_text
            + "\n%\\end{landscape}"
            + "\n\\end{document}")
Пример #2
0
def convert_doc_to_mathml(doc, use_gold_trees=False):
    """
    Render all CCG derivations of one XML <document> node as an HTML
    fragment containing presentation MathML.

    Every <ccg> node of every sentence under ./sentences/sentence is drawn,
    paired with the <semantics> node at the same position when there is one.
    The tree whose index equals the sentence's 'gold_tree' attribute gets
    " (gold)" appended to its id. Sentences are labelled "Premise <n>" using
    their zero-based position, except the last one, labelled "Conclusion".
    Trees for which build_ccg_tree() raises ValueError are replaced by an
    error paragraph. Coq scripts stored under ./proof are appended inside
    <pre> elements.

    If a sentence has no <tokens> node, the markup accumulated so far is
    returned immediately (without the Coq scripts).

    use_gold_trees is accepted but not used by this function.
    """
    sentence_count = int(doc.xpath('count(./sentences/sentence)'))
    fragments = []
    for position, sentence in enumerate(doc.xpath('./sentences/sentence')):
        gold_index = int(sentence.get('gold_tree', -1))
        if position >= sentence_count - 1:
            label = 'Conclusion'
        else:
            label = 'Premise {0}'.format(position)
        surface = get_surf_from_xml_node(sentence)
        ccg_nodes = sentence.xpath('./ccg')
        sem_nodes = sentence.xpath('./semantics')
        token_nodes = sentence.xpath('./tokens')
        if not token_nodes:
            return "".join(fragments)
        sentence_tokens = token_nodes[0]
        assert len(ccg_nodes) >= len(sem_nodes)
        for tree_index, ccg_node in enumerate(ccg_nodes):
            tree_id = ccg_node.get('id', str(tree_index))
            try:
                syntax_tree = build_ccg_tree(ccg_node)
            except ValueError:
                # Report the failure under the plain (unmarked) tree id.
                heading = "<p>{0}, tree {1}: {2}</p>\n".format(
                    label, tree_id, surface)
                fragments.append(
                    heading
                    + "<p>Syntactic parse error. Visualization skipped.</p>")
                continue
            if tree_index == gold_index:
                tree_id += " (gold)"
            semantic_tree = None
            if tree_index < len(sem_nodes) \
                    and sem_nodes[tree_index] is not None:
                semantic_tree = build_ccg_tree(sem_nodes[tree_index])
            heading = "<p>{0}, tree {1}: {2}</p>\n".format(
                label, tree_id, surface)
            rendered = convert_node_to_mathml(
                syntax_tree, semantic_tree, sentence_tokens)
            fragments.append(
                heading
                + "<math xmlns='http://www.w3.org/1998/Math/MathML'>\n"
                + rendered
                + "</math>\n")
    coq_scripts = doc.xpath(
        './proof/master_theorem/theorems/theorem/coq_script/text()')
    debug_markup = ""
    if coq_scripts:
        debug_markup = "<p>Script piped to coq</p>" + "".join(
            "<pre>\n" + script + "\n</pre>\n" for script in coq_scripts)
    return '{0}\n{1}'.format("".join(fragments), debug_markup)
Пример #3
0
 def test_NonterminalButpreterminal(self):
     # Two unary "lex" spans below one binary root. Element 1 of the built
     # tree is queried; the expected result holds the attributes of span
     # sp1-4 plus those of its terminal child prefixed with "child0_".
     xml_source = r"""
   <sentence id="s1">
     <tokens>
       <token surf="surf1" id="t1_1"/>
       <token surf="surf2" id="t1_2"/>
     </tokens>
     <ccg root="sp1-5">
       <span terminal="t1_1" category="cat1" id="sp1-1"/>
       <span terminal="t1_2" category="cat2" id="sp1-2"/>
       <span child="sp1-1" rule="lex1" category="NP1" id="sp1-3"/>
       <span child="sp1-2" rule="lex2" category="NP2" id="sp1-4"/>
       <span child="sp1-3 sp1-4" rule="rr" category="NPP" id="sp1-5"/>
     </ccg>
   </sentence>
 """
     sentence_node = etree.fromstring(xml_source)
     tree_root = build_ccg_tree(sentence_node.find("ccg"))
     token_node = sentence_node.find("tokens")
     actual = get_attributes_from_ccg_node_recursively(
         tree_root[1], token_node)
     expected = {
         'category': 'NP2',
         'rule': 'lex2',
         'id': 'sp1-4',
         'child': 'sp1-2',
         'child0_terminal': 't1_2',
         'child0_surf': 'surf2',
         'child0_category': 'cat2',
         'child0_id': 'sp1-2',
     }
     # Same size plus same value for every expected key.
     self.assertEqual(
         len(expected), len(actual),
         '\n{0}\nvs.\n{1}'.format(expected, actual))
     for key, value in expected.items():
         self.assertEqual(value, actual.get(key, None))
Пример #4
0
 def test_nonterminal1(self):
     # A chain of two unary spans above a single terminal: the attributes
     # of each level are expected under an additional "child0_" prefix.
     xml_source = r"""
   <sentence id="s1">
     <tokens>
       <token surf="surf1" id="t1_1"/>
     </tokens>
     <ccg root="sp1-3">
       <span terminal="t1_1" category="cat1" id="sp1-1"/>
       <span child="sp1-1" rule="lex" category="NP" id="sp1-2"/>
       <span child="sp1-2" rule="rr" category="NPP" id="sp1-3"/>
     </ccg>
   </sentence>
 """
     sentence_node = etree.fromstring(xml_source)
     tree_root = build_ccg_tree(sentence_node.find("ccg"))
     token_node = sentence_node.find("tokens")
     actual = get_attributes_from_ccg_node_recursively(tree_root, token_node)
     expected = {
         'category': 'NPP',
         'rule': 'rr',
         'id': 'sp1-3',
         'child': 'sp1-2',
         'child0_category': 'NP',
         'child0_rule': 'lex',
         'child0_id': 'sp1-2',
         'child0_child': 'sp1-1',
         'child0_child0_terminal': 't1_1',
         'child0_child0_surf': 'surf1',
         'child0_child0_category': 'cat1',
         'child0_child0_id': 'sp1-1',
     }
     # Same size plus same value for every expected key.
     self.assertEqual(len(expected), len(actual))
     for key, value in expected.items():
         self.assertEqual(value, actual.get(key, None))
Пример #5
0
 def test_terminal(self):
     # A tree made of a single terminal span: the expected result merges
     # the span's attributes with those of the token it points to.
     xml_source = r"""
   <sentence id="s1">
     <tokens>
       <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
     </tokens>
     <ccg root="sp1-1">
       <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
     </ccg>
   </sentence>
 """
     sentence_node = etree.fromstring(xml_source)
     tree_root = build_ccg_tree(sentence_node.find("ccg"))
     token_node = sentence_node.find("tokens")
     actual = get_attributes_from_ccg_node_recursively(tree_root, token_node)
     expected = {
         'terminal': 't1_1',
         'category': 'cat1',
         'end': '2',
         'begin': '1',
         'id': 'sp1-1',
         'base': 'base1',
         'pos': 'pos1',
         'surf': 'surf1',
     }
     # Same size plus same value for every expected key; a missing key
     # surfaces as a lookup error on the actual attributes.
     self.assertEqual(len(expected), len(actual))
     for key, value in expected.items():
         self.assertEqual(value, actual[key])
Пример #6
0
 def test_preterminal(self):
     # One unary span directly above a terminal: the terminal's attributes
     # (including the token's surface form) are expected under "child0_".
     xml_source = r"""
   <sentence id="s1">
     <tokens>
       <token surf="surf1" id="t1_1"/>
     </tokens>
     <ccg root="sp1-2">
       <span terminal="t1_1" category="cat1" id="sp1-1"/>
       <span child="sp1-1" rule="lex" category="NP" id="sp1-2"/>
     </ccg>
   </sentence>
 """
     sentence_node = etree.fromstring(xml_source)
     tree_root = build_ccg_tree(sentence_node.find("ccg"))
     token_node = sentence_node.find("tokens")
     actual = get_attributes_from_ccg_node_recursively(tree_root, token_node)
     expected = dict([
         ('category', 'NP'),
         ('rule', 'lex'),
         ('id', 'sp1-2'),
         ('child', 'sp1-1'),
         ('child0_terminal', 't1_1'),
         ('child0_surf', 'surf1'),
         ('child0_category', 'cat1'),
         ('child0_id', 'sp1-1'),
     ])
     # Same size plus same value for every expected key.
     self.assertEqual(len(expected), len(actual))
     for key, value in expected.items():
         self.assertEqual(value, actual.get(key, None))
Пример #7
0
 def test_terminal(self):
     # Single terminal span: span attributes and the attributes of the
     # referenced token are expected side by side, without any prefix.
     xml_source = r"""
   <sentence id="s1">
     <tokens>
       <token base="base1" pos="pos1" surf="surf1" id="t1_1"/>
     </tokens>
     <ccg root="sp1-1">
       <span terminal="t1_1" category="cat1" end="2" begin="1" id="sp1-1"/>
     </ccg>
   </sentence>
 """
     sentence_node = etree.fromstring(xml_source)
     tree_root = build_ccg_tree(sentence_node.find("ccg"))
     token_node = sentence_node.find("tokens")
     actual = get_attributes_from_ccg_node_recursively(tree_root, token_node)
     expected = dict([
         ('terminal', 't1_1'),
         ('category', 'cat1'),
         ('end', '2'),
         ('begin', '1'),
         ('id', 'sp1-1'),
         ('base', 'base1'),
         ('pos', 'pos1'),
         ('surf', 'surf1'),
     ])
     # Same size plus same value for every expected key; a missing key
     # surfaces as a lookup error on the actual attributes.
     self.assertEqual(len(expected), len(actual))
     for key, value in expected.items():
         self.assertEqual(value, actual[key])
Пример #8
0
 def test_nonterminal2(self):
     # Binary root over two preterminals: attributes of the left branch are
     # expected under "child0_", those of the right branch under "child1_".
     xml_source = r"""
   <sentence id="s1">
     <tokens>
       <token surf="surf1" id="t1_1"/>
       <token surf="surf2" id="t1_2"/>
     </tokens>
     <ccg root="sp1-5">
       <span terminal="t1_1" category="cat1" id="sp1-1"/>
       <span terminal="t1_2" category="cat2" id="sp1-2"/>
       <span child="sp1-1" rule="lex1" category="NP1" id="sp1-3"/>
       <span child="sp1-2" rule="lex2" category="NP2" id="sp1-4"/>
       <span child="sp1-3 sp1-4" rule="rr" category="NPP" id="sp1-5"/>
     </ccg>
   </sentence>
 """
     sentence_node = etree.fromstring(xml_source)
     tree_root = build_ccg_tree(sentence_node.find("ccg"))
     token_node = sentence_node.find("tokens")
     actual = get_attributes_from_ccg_node_recursively(tree_root, token_node)
     root_part = {
         'category': 'NPP',
         'rule': 'rr',
         'id': 'sp1-5',
         'child': 'sp1-3 sp1-4',
     }
     left_part = {
         'child0_category': 'NP1',
         'child0_rule': 'lex1',
         'child0_id': 'sp1-3',
         'child0_child': 'sp1-1',
         'child0_child0_terminal': 't1_1',
         'child0_child0_surf': 'surf1',
         'child0_child0_category': 'cat1',
         'child0_child0_id': 'sp1-1',
     }
     right_part = {
         'child1_category': 'NP2',
         'child1_rule': 'lex2',
         'child1_id': 'sp1-4',
         'child1_child': 'sp1-2',
         'child1_child0_terminal': 't1_2',
         'child1_child0_surf': 'surf2',
         'child1_child0_category': 'cat2',
         'child1_child0_id': 'sp1-2',
     }
     # Merge in root / left / right order so the failure message lists the
     # keys in the same order as a single literal would.
     expected = {}
     for part in (root_part, left_part, right_part):
         expected.update(part)
     self.assertEqual(
         len(expected), len(actual),
         '\n{0}\nvs.\n{1}'.format(expected, actual))
     for key, value in expected.items():
         self.assertEqual(value, actual.get(key, None))
def convert_vertical_to_mathml(doc, verbatim_strings=None, use_gold_trees=False):
    """
    Render the CCG derivations of an XML document as a standalone HTML page
    containing presentation MathML, with an inline MathJax configuration.

    One <ccg> node is selected per <sentence>: the one indexed by the
    sentence's 'gold_tree' attribute (default '0') when use_gold_trees is
    true, otherwise the first one. Every selected tree is rendered with
    convert_node_to_mathml(), together with the <tokens> node at the same
    position and, if the document contains <semantics> nodes, the semantic
    tree at the same position. The last sentence is labelled "Conclusion",
    the preceding ones "Premise 1", "Premise 2", ...

    Args:
        doc: XML node exposing an xpath() method.
        verbatim_strings: optional sequence of strings printed inside <pre>
            elements at the end of the page, for debugging. None (the
            default) or an empty sequence prints nothing.
        use_gold_trees: select each sentence's gold tree instead of its
            first tree.

    Returns:
        The HTML page as a string.

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of <tokens> nodes.
    """
    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('//sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('//sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]
    sem_trees = [build_ccg_tree(c) for c in doc.xpath('//semantics')]
    if not sem_trees:
        # No semantics available: pair every syntactic tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('//tokens')
    assert len(ccg_trees) == len(tokens)

    # Every sentence but the last is a premise; the last is the conclusion.
    num_hypotheses = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_hypotheses)
    ]
    sentence_ids.append("Conclusion: ")

    # NOTE(review): sentence text and verbatim strings are inserted into the
    # page without HTML escaping; confirm whether callers pre-escape them.
    sections = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        sections.append(
            "<p>" + sentence_ids[i] + sentence_surface + "</p>\n"
            + "<math xmlns='http://www.w3.org/1998/Math/MathML'>"
            + convert_node_to_mathml(ccg_tree, sem_trees[i], tokens[i])
            + "</math>")
    mathml_str = "".join(sections)

    verbatim_text = ""
    if verbatim_strings:
        verbatim_text = "<p>Script piped to coq</p>" + "".join(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    # Three deliberate points in the page header:
    #  * the text/x-mathjax-config block precedes the script that loads
    #    MathJax.js, as MathJax v2 requires for in-line configuration;
    #  * the backslashes of the \( \) delimiters are escaped for Python and
    #    for JavaScript, so the page contains '\\(' and MathJax sees '\(';
    #  * MathJax is loaded from cdnjs because cdn.mathjax.org was retired.
    html_str = """\
  <!doctype html>
  <html lang='en'>
  <head>
    <meta charset='UTF-8'>
    <title>CCG to Lambda conversion</title>
    <style>
      body {
        font-size: 1em;
      }
    </style>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
  tex2jax: {
    inlineMath: [['$','$'], ['\\\\(','\\\\)']],
    processEscapes: true
  },
  CommonHTML: { matchFontHeight: false },
  displayAlign: "left",
  displayIndent: "2em"
});
MathJax.Hub.Config({
  "HTML-CSS": {
    availableFonts: [],
    preferredFont: null,
    webFont: "Neo-Euler"
  }
});
</script>
    <script type="text/javascript"
            src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
    </script>
  </head>
  <body>
  """
    html_str += mathml_str
    html_str += verbatim_text
    html_str += """\
  </body>
  </html>
  """
    return html_str
Пример #10
0
def convert_doc_to_mathml_(doc, verbatim_strings=None, use_gold_trees=False):
    """
    Render the CCG derivations of one XML <document> node as an HTML page
    containing presentation MathML, and return that page HTML-escaped.

    One <ccg> node is selected per sentence under ./sentences/sentence: the
    one indexed by the sentence's 'gold_tree' attribute (default '0') when
    use_gold_trees is true, otherwise the first one. Every selected tree is
    rendered with convert_node_to_mathml(), together with the <tokens> node
    at the same position and, if there are <semantics> nodes, the semantic
    tree at the same position. The last sentence is labelled "Conclusion",
    the preceding ones "Premise 1", "Premise 2", ...

    Args:
        doc: XML node exposing an xpath() method.
        verbatim_strings: optional sequence of strings printed inside <pre>
            elements at the end of the page, for debugging. None (the
            default) or an empty sequence prints nothing.
        use_gold_trees: select each sentence's gold tree instead of its
            first tree.

    Returns:
        The whole page with '&', '<' and '>' replaced by HTML entities
        (quotes are left untouched).

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of <tokens> nodes.
    """
    # cgi.escape() was removed in Python 3.8; html.escape(quote=False) is the
    # equivalent call. The fallback keeps very old interpreters working.
    try:
        from html import escape as escape_markup
    except ImportError:
        from cgi import escape as escape_markup

    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('./sentences/sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('./sentences/sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]
    sem_trees = [
        build_ccg_tree(c) for c in doc.xpath('./sentences/sentence/semantics')
    ]
    if not sem_trees:
        # No semantics available: pair every syntactic tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('./sentences/sentence/tokens')
    assert len(ccg_trees) == len(tokens)

    # Every sentence but the last is a premise; the last is the conclusion.
    num_hypotheses = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_hypotheses)
    ]
    sentence_ids.append("Conclusion: ")

    sections = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        sections.append(
            "<p>" + sentence_ids[i] + sentence_surface + "</p>\n"
            + "<math xmlns='http://www.w3.org/1998/Math/MathML'>"
            + convert_node_to_mathml(ccg_tree, sem_trees[i], tokens[i])
            + "</math>")
    mathml_str = "".join(sections)

    verbatim_text = ""
    if verbatim_strings:
        verbatim_text = "<p>Script piped to coq</p>" + "".join(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    html_str = """\
  <!doctype html>
  <html lang='en'>
  <head>
    <style>
      body {
        font-size: 1em;
      }
    </style>
    <meta charset='UTF-8'>
    <title>CCG to Lambda conversion</title>
    <script type="text/javascript"
            src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
    </script>
  </head>
  <body>
  """
    html_str += mathml_str
    html_str += verbatim_text
    html_str += """\
  </body>
  </html>
  """
    # NOTE(review): the complete page, markup included, is escaped here, so
    # the result displays as source text when inserted into HTML. Presumably
    # intended for embedding; confirm against the caller.
    return escape_markup(html_str, quote=False)
Пример #11
0
def convert_root_to_mathml(root, verbatim_strings=None, use_gold_trees=False):
    """
    Render the CCG derivations of every <document> under an XML root as one
    standalone HTML page containing presentation MathML.

    For each document, every <ccg> node of every sentence under
    ./sentences/sentence is drawn with convert_node_to_mathml(), paired with
    the <semantics> node at the same position when there is one. The tree
    whose index equals the sentence's 'gold_tree' attribute gets " (gold)"
    appended to its id. Sentences are labelled "Premise <n>" using their
    zero-based position, except the last one, labelled "Conclusion".

    Args:
        root: XML node exposing an xpath() method, with <document> children.
        verbatim_strings: optional sequence of strings printed inside <pre>
            elements after each document, for debugging. None (the default)
            or an empty sequence prints nothing.
        use_gold_trees: accepted but not used by this function.

    Returns:
        The HTML page as a string.

    Raises:
        AssertionError: if a sentence has more <semantics> than <ccg> nodes.
        IndexError: if a sentence has no <tokens> node.
    """
    # The debugging text does not depend on the document: build it once.
    verbatim_text = ""
    if verbatim_strings:
        verbatim_text = "<p>Script piped to coq</p>" + "".join(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    doc_mathml_strs = []
    for doc in root.xpath('./document'):
        num_sentences = int(doc.xpath('count(./sentences/sentence)'))
        sections = []
        for sent_ind, sentence in enumerate(doc.xpath('./sentences/sentence')):
            gold_tree_index = int(sentence.get('gold_tree', -1))
            # NOTE(review): premises are numbered from 0 here; confirm whether
            # one-based numbering was intended.
            if sent_ind < num_sentences - 1:
                sentence_label = 'Premise {0}'.format(sent_ind)
            else:
                sentence_label = 'Conclusion'
            sentence_text = get_surf_from_xml_node(sentence)
            ccg_trees = sentence.xpath('./ccg')
            sem_trees = sentence.xpath('./semantics')
            tokens = sentence.xpath('./tokens')[0]
            assert len(ccg_trees) >= len(sem_trees)
            for i, ccg_node in enumerate(ccg_trees):
                ccg_tree_id = ccg_node.get('id', str(i))
                ccg_tree = build_ccg_tree(ccg_node)
                if gold_tree_index == i:
                    ccg_tree_id += " (gold)"
                # Trees beyond the last <semantics> node have no semantics.
                sem_tree = None if i >= len(sem_trees) else sem_trees[i]
                if sem_tree is not None:
                    sem_tree = build_ccg_tree(sem_tree)
                sections.append(
                    "<p>{0}, tree {1}: {2}</p>\n".format(
                        sentence_label, ccg_tree_id, sentence_text)
                    + "<math xmlns='http://www.w3.org/1998/Math/MathML'>\n"
                    + convert_node_to_mathml(ccg_tree, sem_tree, tokens)
                    + "</math>\n")
        doc_mathml_strs.append(
            '{0}\n{1}'.format("".join(sections), verbatim_text))

    # MathJax is loaded from cdnjs because cdn.mathjax.org was retired.
    html_str = """\
  <!doctype html>
  <html lang='en'>
  <head>
    <meta charset='UTF-8'>
    <title>CCG to Lambda conversion</title>
    <style>
      body {
        font-size: 1em;
      }
    </style>
    <script type="text/javascript"
            src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
    </script>
  </head>
  <body>
  """
    html_str += '\n'.join(doc_mathml_strs)
    html_str += """\
  </body>
  </html>
  """
    return html_str
Пример #12
0
def convert_doc_to_mathml(doc, verbatim_strings=None, use_gold_trees=False):
    """
    Render the CCG derivations of an XML document as a standalone HTML page
    containing presentation MathML.

    One <ccg> node is selected per <sentence>: the one indexed by the
    sentence's 'gold_tree' attribute (default '0') when use_gold_trees is
    true, otherwise the first one. Every selected tree is rendered with
    convert_node_to_mathml(), together with the <tokens> node at the same
    position and, if the document contains <semantics> nodes, the semantic
    tree at the same position. The last sentence is labelled "Conclusion",
    the preceding ones "Premise 1", "Premise 2", ...

    Args:
        doc: XML node exposing an xpath() method.
        verbatim_strings: optional sequence of strings printed inside <pre>
            elements at the end of the page, for debugging. None (the
            default) or an empty sequence prints nothing.
        use_gold_trees: select each sentence's gold tree instead of its
            first tree.

    Returns:
        The HTML page as a string.

    Raises:
        AssertionError: if the number of selected CCG trees differs from
            the number of <tokens> nodes.
    """
    if use_gold_trees:
        ccg_nodes = []
        for sentence in doc.xpath('//sentence'):
            gold_tree_index = int(sentence.get('gold_tree', '0'))
            ccg_nodes.append(sentence.xpath('./ccg')[gold_tree_index])
    else:
        ccg_nodes = doc.xpath('//sentence/ccg[1]')
    ccg_trees = [build_ccg_tree(c) for c in ccg_nodes]
    sem_trees = [build_ccg_tree(c) for c in doc.xpath('//semantics')]
    if not sem_trees:
        # No semantics available: pair every syntactic tree with None.
        sem_trees = [None] * len(ccg_trees)
    tokens = doc.xpath('//tokens')
    assert len(ccg_trees) == len(tokens)

    # Every sentence but the last is a premise; the last is the conclusion.
    num_hypotheses = len(ccg_trees) - 1
    sentence_ids = [
        "Premise {0}: ".format(i + 1) for i in range(num_hypotheses)
    ]
    sentence_ids.append("Conclusion: ")

    sections = []
    for i, ccg_tree in enumerate(ccg_trees):
        sentence_surface = ' '.join(tokens[i].xpath('token/@surf'))
        sections.append(
            "<p>" + sentence_ids[i] + sentence_surface + "</p>\n"
            + "<math xmlns='http://www.w3.org/1998/Math/MathML'>"
            + convert_node_to_mathml(ccg_tree, sem_trees[i], tokens[i])
            + "</math>")
    mathml_str = "".join(sections)

    verbatim_text = ""
    if verbatim_strings:
        verbatim_text = "<p>Script piped to coq</p>" + "".join(
            "<pre>\n" + vb_str + "\n</pre>\n" for vb_str in verbatim_strings)

    # MathJax is loaded from cdnjs because cdn.mathjax.org was retired.
    html_str = """\
  <!doctype html>
  <html lang='en'>
  <head>
    <meta charset='UTF-8'>
    <title>CCG to Lambda conversion</title>
    <style>
      body {
        font-size: 1em;
      }
    </style>
    <script type="text/javascript"
            src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
    </script>
  </head>
  <body>
  """
    html_str += mathml_str
    html_str += verbatim_text
    html_str += """\
  </body>
  </html>
  """
    return html_str