Пример #1
0
 def test_get_paragraphs_between(self):
     """Check get_paragraphs_between against the fixture XML.

     Parses self.rawxml, calls get_paragraphs_between with the strings
     'Main items' and 'Other stuff', and expects exactly the three item
     names below, in this order.
     """
     expected = ['Very large tomatoes', 'Huge apricots', 'Mediocre marrows']
     all_paragraphs = parsedoc.parse_docx_paragraphs(self.rawxml)
     found = parsedoc.get_paragraphs_between(
         all_paragraphs, 'Main items', 'Other stuff')
     self.assertEqual(expected, found)
Пример #2
0
 def test_get_paragraphs_containing(self):
     """Check get_paragraphs_containing against the fixture XML.

     The item names come from get_paragraphs_between (called with
     'Main items' and 'Other stuff'); they are then passed together with
     the text 'Measurement:'.  The result must equal a dict mapping each
     item name to its 'Measurement: ...' string.
     """
     expected = {
         'Very large tomatoes': 'Measurement: 20cm',
         'Huge apricots': 'Measurement: 15cm',
         'Mediocre marrows': 'Measurement: 12cm',
     }
     all_paragraphs = parsedoc.parse_docx_paragraphs(self.rawxml)
     item_names = parsedoc.get_paragraphs_between(
         all_paragraphs, 'Main items', 'Other stuff')
     found = parsedoc.get_paragraphs_containing(
         all_paragraphs, item_names, 'Measurement:')
     self.assertEqual(expected, found)
Пример #3
0
def get_dataset_list():
    """Parse and return expected array of descriptors per dataset.

    The document at DELIVERABLE_D4_3_RELATIVE_PATH (joined onto
    eustaceconfig.SYSTEM_PATH and normalised) is opened in binary mode and
    parsed into paragraphs.  Dataset names are the result of
    get_paragraphs_between for DATASET_CONTENTS_START and
    DATASET_CONTENTS_END; get_paragraphs_containing is then called with
    those names and eustaceconfig.WORKSPACE_PATH to obtain the paths.

    Returns:
        A list holding one ``CatalogueDataSet(name, paths.get(name))`` per
        dataset name, in the order the names were returned.  Assuming
        ``paths`` has dict-like ``.get`` semantics, the second argument is
        ``None`` for a name that has no entry.
    """
    pathname = os.path.normpath(
        os.path.join(eustaceconfig.SYSTEM_PATH,
                     DELIVERABLE_D4_3_RELATIVE_PATH))
    # The context manager guarantees the handle is closed, even when
    # retrieval or parsing raises.
    with open(pathname, 'rb') as inputfile:
        rawxml = retrieve_document_xml(inputfile)
        # Parsed while the file is still open, in case the retrieved XML is
        # read lazily from it (the helper is not visible here).
        paragraphs = parse_docx_paragraphs(rawxml,
                                           ignore_trailing_digits=True)
    names = get_paragraphs_between(paragraphs, DATASET_CONTENTS_START,
                                   DATASET_CONTENTS_END)
    paths = get_paragraphs_containing(paragraphs, names,
                                      eustaceconfig.WORKSPACE_PATH)
    return [CatalogueDataSet(name, paths.get(name)) for name in names]
Пример #4
0
 def test_parse_docx_paragraphs(self):
     """Check parse_docx_paragraphs against the fixture XML.

     Spot-checks two entries of the result for self.rawxml: index 0 and
     index 5 must equal the expected strings below.
     """
     parsed = parsedoc.parse_docx_paragraphs(self.rawxml)
     spot_checks = (
         (0, 'Test Document for Parsing with Python'),
         (5, 'Very large tomatoes'),
     )
     # Each entry is indexed only when its turn comes, so the first
     # comparison runs before index 5 is ever accessed.
     for index, expected_text in spot_checks:
         self.assertEqual(expected_text, parsed[index])
Пример #5
0
    def load_document(self, pathname):
        """Open specified document (MS Word format) and read all paragraphs.

        The paragraphs returned by parsedoc.parse_docx_paragraphs are stored
        on the instance as ``self.paragraphs_all``.

        Args:
            pathname: Path of the document to open.
        """
        # Binary mode: a Word .docx file is a binary (zip) container, so
        # text mode can mangle or fail to decode its bytes.  The context
        # manager closes the handle even if retrieval or parsing raises.
        with open(pathname, 'rb') as spec:
            spec_xml = parsedoc.retrieve_document_xml(spec)
            # Parsed while the file is still open, in case the retrieved XML
            # is read lazily from it (the helper is not visible here).
            self.paragraphs_all = parsedoc.parse_docx_paragraphs(spec_xml)