def _convert_row(doc_date, xml):
    """
    Turn one table row into a (date, text) pair.

    `doc_date` is the fallback date used when the row does not supply
    a usable one of its own; `xml` is the row element.

    The row may contain at most one `th` node.  When present, its text
    (empty string if the node has no text) is passed through
    `_clean_date` and then `read_date` with `doc_date` as the prefix.
    The entry text is the output of `_column_to_text` for the `th`
    node followed by every `td` node, joined with newlines.

    Returns `(date, text)`, where the date falls back to `doc_date`
    whenever the row date is missing or falsy, or `None` if the row
    yields no text at all.

    Raises `Exception` (after dumping the row with `ET.dump`) if more
    than one `th` node is found.
    """
    headers = list(xml.iter('th'))
    cells = list(xml.iter('td'))

    if len(headers) > 1:
        # dump the offending row so it can be inspected before we bail out
        ET.dump(xml)
        raise Exception("Did not expect more than one th node")

    row_date = None
    if headers:
        cleaned = _clean_date(headers[0].text or "")
        row_date = read_date(cleaned, prefix=doc_date, fuzzy=True)

    pieces = [_column_to_text(node) for node in headers + cells]
    body = "\n".join(pieces)
    if not body:
        return None
    return row_date or doc_date, body
def _convert_section(xml):
    """
    Convert every `tr` row found under `xml` with `_convert_row`.

    The section date is the first non-None result of running
    `read_date(_clean_date(...))` over the text of each `head` node
    (None if there is no such result); it is handed to `_convert_row`
    as the default date for each row.

    Returns a list with one `_convert_row` result per `tr` node, in
    iteration order.

    WARNING: mutates the tree -- the text of every `br` node is
    replaced by a newline character.
    """
    # every head node is parsed, even though only the first hit is kept
    head_dates = []
    for head in xml.iter('head'):
        parsed = read_date(_clean_date(head.text))
        if parsed is not None:
            head_dates.append(parsed)
    section_date = head_dates[0] if head_dates else None

    for line_break in xml.iter('br'):
        line_break.text = "\n"

    rows = []
    for row in xml.iter('tr'):
        rows.append(_convert_row(section_date, row))
    return rows
def _write_membrane(ntext, oprefix):
    """
    Write out the text of a single membrane, one file per entry.

    The first line of `ntext` is passed to `_membrane_name` to obtain
    the membrane name; each remaining line is one entry.  Entry number
    N (counting from 1) is written, UTF-8 encoded, to the file
    `<oprefix>-<membrane name>-<N>`, with N zero-filled to the width
    returned by `_digits` for the list of entries.

    If the first three whitespace-separated words of an entry are
    accepted by `read_date`, the resulting date is printed on its own
    line ahead of the entry text.
    """
    all_lines = ntext.split("\n")
    membrane = _membrane_name(all_lines[0])
    entries = all_lines[1:]
    width = _digits(entries)

    for number, entry in enumerate(entries, start=1):
        leading_words = " ".join(entry.split()[:3])
        try:
            entry_date = read_date(leading_words, fuzzy=True)
        except ValueError:
            # the entry does not open with something read_date accepts
            entry_date = None

        suffix = str(number).zfill(width)
        out_path = "-".join([oprefix, membrane, suffix])
        with codecs.open(out_path, 'w', 'utf-8') as stream:
            if entry_date is not None:
                print(entry_date, file=stream)
            print(entry, file=stream)
def _write_membrane(ntext, oprefix):
    """
    Split a membrane's text into entries and save each to its own file.

    Line one of `ntext` goes to `_membrane_name` and supplies the
    membrane name used in the output filenames; every later line is
    treated as a separate entry.  Files are named by joining
    `oprefix`, the membrane name and the 1-based entry index with
    hyphens, the index being zero-filled to the width `_digits`
    reports for the entries.  Output is written as UTF-8.

    When `read_date` accepts the first three words of an entry, the
    date is printed as the first line of that entry's file.
    """
    parts = ntext.split("\n")
    name = _membrane_name(parts[0])
    body_lines = parts[1:]
    pad = _digits(body_lines)

    for index, text in enumerate(body_lines, start=1):
        first_three = text.split()[:3]
        candidate = " ".join(first_three)
        try:
            stamp = read_date(candidate, fuzzy=True)
        except ValueError:
            # nothing date-like at the start of this entry
            stamp = None

        target = "-".join([oprefix, name, str(index).zfill(pad)])
        with codecs.open(target, 'w', 'utf-8') as stream:
            if stamp is not None:
                print(stamp, file=stream)
            print(text, file=stream)
def assertDateEqual(self, expected, dstr, **kwargs):
    """
    Assert that the date string `dstr` parses to `expected`.

    `dstr` is parsed with `read_date`; any extra keyword arguments
    are forwarded to it unchanged.  The comparison is delegated to
    `self.assertEqual`, with `expected` as the first argument.
    """
    parsed = read_date(dstr, **kwargs)
    self.assertEqual(expected, parsed)