def test_same_xml_and_html(self):
    """Check that XML and HTML text extraction agree on sampled entries.

    For each sampled index, the text returned by ``px.recutext_xml`` for the
    entry of ``self.xml_lst1`` must equal the text returned by
    ``px.recutext_html`` for the entry of ``self.html_lst1`` at the same index.
    """
    # Indices are checked in this order; the first mismatch fails the test.
    for idx in (34, 44, 4):
        from_xml = px.recutext_xml(self.xml_lst1[idx])
        from_html = px.recutext_html(self.html_lst1[idx])
        self.assertEqual(from_xml, from_html)
def sample_article(f, ns, para_per_article=10, min_words=15):
    '''
    Usage: draw a random sample of paragraph texts from one article.

    f: the article XML, handed to etree.parse
    ns: namespace mapping used to resolve the `latexml` prefix
    para_per_article: how many `latexml:para` elements random.sample draws
    min_words: a sampled paragraph is kept only when its text has at least
               this many whitespace-separated words

    Returns the list of kept paragraph texts. When parsing raises
    etree.ParseError, or a ValueError is raised while parsing/sampling,
    a message is printed and the result is an empty list.
    '''
    chosen = []  # stays empty if parsing or sampling fails
    try:
        parser = etree.XMLParser(remove_comments=True)
        tree = etree.parse(f, parser)
        all_paras = tree.findall('.//latexml:para', ns)
        chosen = random.sample(all_paras, para_per_article)
    except etree.ParseError:
        print('article %s could no be parsed' % f)
    except ValueError as ve:
        print('article %s has few paragraphs: %s' % (f, ve))

    texts = []
    for para in chosen:
        # Paragraphs rejected by the sanity check are reported and skipped.
        if not px.check_sanity(para, ns):
            print('article %s has messed up para' % f)
            continue
        text = px.recutext_xml(para)
        word_count = len(text.split())
        if word_count >= min_words:
            texts.append(text)
    return texts
def para_tags(f, ns, min_words=0):
    '''
    Usage: collect the text of every `latexml:para` element of one article.

    f: the article XML, handed to ET.parse
    ns: namespace mapping used to resolve the `latexml` prefix
    min_words: a paragraph is kept only when its text has at least
               this many whitespace-separated words

    Returns the list of kept paragraph texts. When ET.ParseError or
    ValueError is raised while parsing/searching, a message is printed
    and the result is an empty list.
    '''
    found = []  # stays empty if the article cannot be read
    try:
        tree = ET.parse(f)
        found = tree.findall('.//latexml:para', ns)
    except ET.ParseError:
        print('article %s could no be parsed' % f)
    except ValueError:
        print('article %s has few paragraphs' % f)

    texts = []
    for para in found:
        # Paragraphs rejected by the sanity check are reported and skipped.
        if not px.check_sanity(para, ns):
            print('article %s has messed up para' % f)
            continue
        text = px.recutext_xml(para)
        word_count = len(text.split())
        if word_count >= min_words:
            texts.append(text)
    return texts
def create_definition_branch(ind, defi):
    """Build a ``definition`` element describing one definition.

    ind: value stored, via ``repr``, in the element's ``index`` attribute
    defi: object passed to ``px.recutext_xml`` and ``get_definiendum``

    The returned element has one ``stmnt`` child holding the text returned by
    ``px.recutext_xml(defi)``, followed by one ``dfndum`` child per item
    yielded by ``get_definiendum(defi, ns)``. ``ns`` is not a parameter; it is
    read from the enclosing scope.
    """
    branch = etree.Element("definition")
    branch.set('index', repr(ind))

    stmnt_node = etree.SubElement(branch, 'stmnt')
    stmnt_node.text = px.recutext_xml(defi)

    for term in get_definiendum(defi, ns):
        etree.SubElement(branch, 'dfndum').text = term
    return branch
print('{:15} {:>10} {:>10}'.format(s[0], y_true_tmp[k], predicted[k])) return y_true, y_pred # Prepare and print metrics for the normal metrics OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True) y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker) print(metrics.classification_report(y_true, predicted)) # - # An example of a user fed definition chunked = chunker.parse(pos_tag(word_tokenize(Def[0]))) D =list(filter(lambda x: isinstance(x, nltk.tree.Tree), chunked))[0] ' '.join([d[0] for d in D]) art = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml') p_lst = [px.recutext_xml(p) for p in art.tag_list(tag='para')] p_vec = count_vect.transform(p_lst) preds = clf.predict(p_vec) for k,p in enumerate(p_lst): print(k,preds[k],p[:100]) print('------') chunk = tree2conlltags(chunker.parse(pos_tag(word_tokenize(p_lst[63])))) for tok in chunk: print('{:15} {:>10} '.format(tok[0], tok[2])) with open('../PickleJar/chunker.pickle', 'wb') as chunker_f: pickle.dump(chunker, chunker_f) with open('data/vectorizer.pickle', 'wb') as token_f:
def test_recutext_xml(self):
    """Check ``px.recutext_xml`` against known text for two sample entries.

    Each case pairs an index into ``self.xml_lst1`` with the exact text that
    ``px.recutext_xml`` is expected to return for that entry.
    """
    # NOTE(review): the long literal is kept on a single line exactly as it
    # reads in the visible source -- confirm it contains no line breaks.
    cases = (
        (19, 'For the remaining properties we state we shall assume that _inline_math_ or _inline_math_.'),
        (32, '''Let _inline_math_ be a set of elements of _inline_math_. Recall that an _inline_math_-invariant CAD of _inline_math_ _citation_ is a partitioning of _inline_math_ into connected subsets called cells compatible with the zeros of the elements of _inline_math_. The output of a CAD algorithm applied to _inline_math_ is a description of an _inline_math_-invariant CAD _inline_math_ of _inline_math_. That is, _inline_math_ is a decomposition of _inline_math_ determined by the roots of the elements of _inline_math_ over the cells of some cylindrical algebraic decomposition _inline_math_ of _inline_math_; each element of _inline_math_ is sign-invariant throughout every cell of _inline_math_.'''),
    )
    # Cases run in order; the first mismatch fails the test.
    for idx, expected in cases:
        self.assertEqual(expected, px.recutext_xml(self.xml_lst1[idx]))