def test_get_paragraphs_between(self):
    # Entries the fixture document is expected to yield for the
    # 'Main items' / 'Other stuff' marker pair.
    expected = ['Very large tomatoes', 'Huge apricots', 'Mediocre marrows']

    parsed = parsedoc.parse_docx_paragraphs(self.rawxml)
    found = parsedoc.get_paragraphs_between(
        parsed, 'Main items', 'Other stuff')

    self.assertEqual(expected, found)
def test_get_paragraphs_containing(self):
    # Expected mapping from each item name to the paragraph holding
    # the 'Measurement:' search text for that item.
    expected = {
        'Very large tomatoes': 'Measurement: 20cm',
        'Huge apricots': 'Measurement: 15cm',
        'Mediocre marrows': 'Measurement: 12cm'
    }

    parsed = parsedoc.parse_docx_paragraphs(self.rawxml)

    # Item names come from the same marker pair used by the
    # get_paragraphs_between test.
    headings = parsedoc.get_paragraphs_between(
        parsed, 'Main items', 'Other stuff')
    found = parsedoc.get_paragraphs_containing(
        parsed, headings, 'Measurement:')

    self.assertEqual(expected, found)
def get_dataset_list():
    """Parse and return expected array of descriptors per dataset.

    Reads the document located at DELIVERABLE_D4_3_RELATIVE_PATH under
    eustaceconfig.SYSTEM_PATH, extracts its paragraphs, and builds one
    CatalogueDataSet per name found between DATASET_CONTENTS_START and
    DATASET_CONTENTS_END.

    Returns:
        A list of CatalogueDataSet objects in the order the names were
        found. The second constructor argument is the entry looked up
        for that name among the paragraphs containing
        eustaceconfig.WORKSPACE_PATH, or None when no entry exists for
        the name.

    Raises:
        IOError / OSError: if the document cannot be opened.
    """
    pathname = os.path.normpath(
        os.path.join(eustaceconfig.SYSTEM_PATH, DELIVERABLE_D4_3_RELATIVE_PATH))

    # Use a context manager so the file handle is released even when
    # extraction or parsing raises. Parsing stays inside the block so the
    # handle is still open for as long as the raw XML is being consumed.
    with open(pathname, 'rb') as inputfile:
        rawxml = retrieve_document_xml(inputfile)
        paragraphs = parse_docx_paragraphs(rawxml, ignore_trailing_digits=True)

    names = get_paragraphs_between(
        paragraphs, DATASET_CONTENTS_START, DATASET_CONTENTS_END)
    paths = get_paragraphs_containing(
        paragraphs, names, eustaceconfig.WORKSPACE_PATH)

    # paths.get() yields None for names without a matching path paragraph.
    return [CatalogueDataSet(name, paths.get(name)) for name in names]
def test_parse_docx_paragraphs(self):
    parsed = parsedoc.parse_docx_paragraphs(self.rawxml)

    # (index, expected text) pairs spot-checked in the parsed fixture,
    # verified in this order.
    spot_checks = (
        (0, 'Test Document for Parsing with Python'),
        (5, 'Very large tomatoes'),
    )
    for position, text in spot_checks:
        self.assertEqual(text, parsed[position])
def load_document(self, pathname):
    """Open specified document (MS Word format) and read all paragraphs.

    The parsed paragraphs are stored on the instance as
    ``self.paragraphs_all``; nothing is returned.

    Args:
        pathname: Path of the document to read.

    Raises:
        IOError / OSError: if the document cannot be opened.
    """
    # Open in binary mode: a .docx file is a zip container, and text mode
    # can corrupt or fail to decode its bytes. The context manager ensures
    # the handle is closed even if extraction or parsing raises.
    with open(pathname, 'rb') as spec:
        spec_xml = parsedoc.retrieve_document_xml(spec)
        self.paragraphs_all = parsedoc.parse_docx_paragraphs(spec_xml)