def test_paragraphs2(self):
     """Unclosed <p> tags must still produce one Paragraph per tag."""
     expected_texts = ['First para', 'Second Para']
     parsed = HtmlReader().parse('<p>First para<p>Second Para')
     # Length is checked first so the per-element checks below cannot
     # silently skip a missing or extra element.
     self.assertEqual(len(parsed.elements), len(expected_texts))
     for position, wanted in enumerate(expected_texts):
         self.assertEqual(parsed.elements[position].text, wanted)
     for element in parsed.elements:
         self.assertIsInstance(element, Paragraph)
 def test_linebreak2(self):
     """A <br/> between two spans must split them into two Paragraphs."""
     expected_texts = ['First line', 'Second line']
     markup = '<span>First line</span><br/><span>Second line</span>'
     parsed = HtmlReader().parse(markup)
     # Length is checked first so the per-element checks below cannot
     # silently skip a missing or extra element.
     self.assertEqual(len(parsed.elements), len(expected_texts))
     for position, wanted in enumerate(expected_texts):
         self.assertEqual(parsed.elements[position].text, wanted)
     for element in parsed.elements:
         self.assertIsInstance(element, Paragraph)
Пример #3
0
def read_html_paper(paper_path):
    """Open an HTML paper and load it as a chemdataextractor Document.

    Args:
        paper_path: Path of the HTML file; it is opened in binary mode.

    Returns:
        Whatever ``Document.from_file`` returns when given the open file
        and ``HtmlReader`` as the only reader.
    """
    # The context manager closes the handle as soon as parsing returns;
    # previously the file stayed open until garbage collection.
    # NOTE(review): this relies on Document.from_file consuming the whole
    # stream before it returns -- confirm for the pinned library version.
    with open(paper_path, 'rb') as f:
        return Document.from_file(f, readers=[HtmlReader()])
Пример #4
0
def _paragraph_sentences(doc, indices):
    """Return the sentences of every Paragraph element of *doc* at *indices*.

    Elements are visited in the order given by *indices*; elements whose
    type is not exactly ``chemdataextractor.doc.text.Paragraph`` are skipped.
    """
    paragraph_cls = chemdataextractor.doc.text.Paragraph
    elements = doc.elements
    return [
        sentence
        for i in indices
        # Exact-type match: subclasses of Paragraph are not collected.
        if type(elements[i]) is paragraph_cls
        for sentence in elements[i]
    ]


def extract_sentences(paper_path, para_yes):
    """Split a paper's paragraph sentences into "yes" and "no" lists.

    Args:
        paper_path: Path of the HTML paper; it is opened in binary mode.
        para_yes: Document element indices of the paragraphs manually
            identified as containing synthesis information.

    Returns:
        A tuple ``(sen_yes_arr, sen_no_arr)``: the sentences of Paragraph
        elements at the ``para_yes`` indices, and the sentences of
        Paragraph elements at every other index of the document.
    """
    # Close the file once parsing is done; previously the handle leaked.
    # NOTE(review): this relies on Document.from_file consuming the whole
    # stream before it returns -- confirm for the pinned library version.
    with open(paper_path, 'rb') as f:
        doc = Document.from_file(f, readers=[HtmlReader()])

    # Every element index that was not marked as a "yes" paragraph.
    elem_all = np.arange(0, len(doc))
    para_no = np.delete(elem_all, para_yes)

    sen_no_arr = _paragraph_sentences(doc, para_no)
    sen_yes_arr = _paragraph_sentences(doc, para_yes)

    return sen_yes_arr, sen_no_arr