# Imports used by the snippets below. open_docx, parse_doc_section, and
# parse_section come from this project's own modules and are not shown in
# this excerpt; hedged sketches for a few other missing helpers follow the
# snippets that use them.
import datetime
import json
import os
import re
import sys

from lxml import etree


def parse_file(dom, path_to_file, start_para_index):
    # Open the Word file. Use a cached json file if it exists
    # since that's faster than opening the raw .docx file.
    print('\nparsing {}'.format(path_to_file), file=sys.stderr)
    fhash = _hashfile(path_to_file)
    doc = None
    tmp_doc = "/tmp/doc.cache.{}.json".format(fhash)
    if os.path.exists(tmp_doc):
        with open(tmp_doc) as doccache:
            doc = json.load(doccache)
    if doc is None:
        doc = open_docx(path_to_file, pict=pict_handler)

    # Leftover debugging check of the division pattern; its result is unused.
    div_re.search('./2015-06/Division VIII.docx').group('div')

    # Number the paragraphs globally, continuing from start_para_index.
    for section in doc['sections']:
        for para_index, para in enumerate(section["paragraphs"], start_para_index):
            para['index'] = para_index
        start_para_index += len(section['paragraphs'])

    with open(tmp_doc, "w") as doccache:
        json.dump(doc, doccache, indent=2)

    try:
        # Parse each section.
        for section in doc["sections"]:
            parse_doc_section(section, dom)
    except Exception:
        import traceback
        traceback.print_exc()

    return start_para_index
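# parse_file relies on a few module-level names this excerpt never defines.
# Minimal sketches follow; the hash algorithm and the exact division regex
# are assumptions, not the original implementations.
import hashlib


def _hashfile(path, chunk_size=1 << 20):
    # Hash the file bytes so the cache file name changes whenever the
    # .docx changes.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


# hashfile (no underscore), used by a later main() variant, is presumably
# the same helper.
hashfile = _hashfile

# Pattern that extracts the division numeral from paths like
# "./2015-06/Division VIII.docx".
div_re = re.compile(r"Division (?P<div>[IVXLCDM]+)")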
def main():
    # Use the West XML to get headings in titlecase. The Lexis document has
    # big level headings in all caps.
    west_dom = etree.parse(open("/home/user/data/dc_code/2012-12-11.xml", "rb"))
    for h in west_dom.xpath('//heading'):
        t = h.text.replace(" (Refs & Annos)", "")
        t = re.sub(r"[\s\.]+$", "", t)
        heading_case_fix[t.upper()] = t

    # Form the output DOM.
    dom = etree.Element("dc-code")
    meta = make_node(dom, "meta", None)
    make_node(meta, "title", "Code of the District of Columbia")
    make_node(meta, "recency", "current through DC Act 19-658; unofficial through D.C. Act 19-682")

    # Open the Word file. Use a cached json file if it exists
    # since that's faster than opening the raw .docx file.
    if not os.path.exists("/tmp/doc.cache.json"):
        doc = open_docx(sys.argv[1], pict=pict_handler)
        with open("/tmp/doc.cache.json", "w") as doccache:
            json.dump(doc, doccache, indent=2)
    else:
        with open("/tmp/doc.cache.json") as doccache:
            doc = json.load(doccache)

    try:
        # Parse each section.
        state = {"stack": None}
        for section in doc["sections"]:
            parse_doc_section(section, dom, state)
    except Exception:
        import traceback
        traceback.print_exc()

    # Output, being careful we get UTF-8 to the byte stream.
    sys.stdout.buffer.write(etree.tostring(dom, pretty_print=True,
                                           encoding="utf-8",
                                           xml_declaration=True))
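# main() above fills heading_case_fix, a module-level table this excerpt
# never declares; presumably it starts empty, as does the little_words set
# used by the variants below:
heading_case_fix = {}
little_words = set()


# A sketch of how the table would be consulted when an all-caps Lexis
# heading is emitted (fix_heading is a hypothetical name, not in the
# original); it strips trailing whitespace and dots the same way the table
# keys were built:
def fix_heading(text):
    t = re.sub(r"[\s\.]+$", "", text)
    return heading_case_fix.get(t.upper(), t)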
def parse_file(dom, path_to_file, start_para_index):
    # Open the Word file. Use a cached json file if it exists
    # since that's faster than opening the raw .docx file.
    print('\nparsing {}'.format(path_to_file), file=sys.stderr)
    fhash = _hashfile(path_to_file)
    doc = None
    tmp_doc = "/tmp/doc.cache.{}.json".format(fhash)
    if os.path.exists(tmp_doc):
        print('loading from', tmp_doc, file=sys.stderr)
        with open(tmp_doc) as doccache:
            doc = json.load(doccache)
    else:
        print('saving to', tmp_doc, file=sys.stderr)
    if doc is None:
        doc = open_docx(path_to_file, pict=pict_handler)

    # Leftover debugging check of the division pattern; its result is unused.
    div_re.search('./2015-06/Division VIII.docx').group('div')

    # Number the paragraphs globally, continuing from start_para_index.
    for section in doc['sections']:
        for para_index, para in enumerate(section["paragraphs"], start_para_index):
            para['index'] = para_index
        start_para_index += len(section['paragraphs'])

    with open(tmp_doc, "w") as doccache:
        json.dump(doc, doccache, indent=2)

    try:
        # Parse each section.
        for section in doc["sections"]:
            parse_doc_section(section, dom)
    except Exception:
        import traceback
        traceback.print_exc()

    return start_para_index
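# parse_file threads the running paragraph index through successive files,
# so a driver over a set of division files looks roughly like this (the
# glob pattern is hypothetical, modeled on the path literal above):
import glob


def parse_all(dom):
    start_para_index = 0
    for path in sorted(glob.glob('./2015-06/Division *.docx')):
        start_para_index = parse_file(dom, path, start_para_index)
    return start_para_index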
def parse_title(fn, dom, toc_location_stack):
    # Load the .docx file.
    doc = open_docx(fn, drawing=drawing_handler)

    # Parse each section.
    for section in doc["sections"]:
        parse_section(section, dom, toc_location_stack)
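# pict= and drawing= are callbacks that open_docx invokes when it hits an
# embedded image; the exact contract lives in the open_docx module, which
# is not shown here. Minimal placeholders, under the assumption that a
# handler receives the image's XML node and returns replacement text:
def pict_handler(node):
    # Ignore legacy VML <w:pict> images.
    return ""


def drawing_handler(node):
    # Ignore DrawingML <w:drawing> images.
    return ""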
def main():
    # Use the West XML to get headings in titlecase. The Lexis document has
    # big level headings in all caps. Also get a list of words that always
    # appear in lowercase so we can correct the remaining titles reasonably well.
    west_dom = etree.parse(
        open("/home/user/data/dc_code/schema-2/2012-12-11.xml", "rb"))
    is_upper_word = set()
    for h in west_dom.xpath('//level[not(type="section")]/heading'):
        t = h.text.replace(" (Refs & Annos)", "")
        t = re.sub(r"[\s\.]+$", "", t)
        heading_case_fix[t.upper()] = t
        for wd in t.split(" "):
            if not re.search(r"[a-z]", wd):
                continue
            if wd == wd.lower():
                little_words.add(wd)
            else:
                is_upper_word.add(wd.lower())
    little_words.difference_update(is_upper_word)
    little_words.remove("disapproval")  # manual fix
    little_words.remove("abolished")  # manual fix

    # Form the output DOM.
    dom = etree.Element("level")
    dom.set("type", "document")
    make_node(dom, "heading", "Code of the District of Columbia")
    meta = make_node(dom, "meta", None)
    make_node(meta, "recency", sys.argv[2])

    # Open the Word file. Use a cached json file if it exists
    # since that's faster than opening the raw .docx file.
    fhash = hashfile(sys.argv[1])
    doc = None
    if os.path.exists("/tmp/doc.cache.json"):
        with open("/tmp/doc.cache.json") as doccache:
            fdata = json.load(doccache)
        if fdata["hash"] == fhash:
            doc = fdata["doc"]
    if doc is None:
        doc = open_docx(sys.argv[1], pict=pict_handler)
        with open("/tmp/doc.cache.json", "w") as doccache:
            json.dump({"hash": fhash, "doc": doc}, doccache, indent=2)

    try:
        # Parse each section.
        state = {"stack": None}
        for section in doc["sections"]:
            parse_doc_section(section, dom, state)
    except Exception:
        import traceback
        traceback.print_exc()

    # Output, being careful we get UTF-8 to the byte stream.
    sys.stdout.buffer.write(
        etree.tostring(dom, pretty_print=True, encoding="utf-8",
                       xml_declaration=True))
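# little_words feeds the "correct the remaining titles reasonably well"
# step: words that always appeared lowercase in the West headings stay
# lowercase when an unknown all-caps heading is re-cased. A sketch of that
# pass (titlecase_heading is a hypothetical name, not in the original):
def titlecase_heading(text):
    words = text.lower().split(" ")
    return " ".join(
        w if (i > 0 and w in little_words) else w.capitalize()
        for i, w in enumerate(words))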
def parse_file(path_to_file):
    doc = open_docx(path_to_file, pict=pict_handler)
    paras = []
    for section in doc['sections']:
        # Number paragraphs globally across sections.
        for index, para in enumerate(section['paragraphs'], len(paras)):
            para['index'] = index
            para['indent'] = get_indent(para)
            # Normalize curly quotes to straight quotes.
            for run in para['runs']:
                run['text'] = run['text'].replace('\u201c', '"').replace('\u201d', '"')
        paras += section['paragraphs']
    return paras
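# get_indent is not defined in this excerpt. A minimal sketch, assuming
# open_docx records Word's left indentation (in twips) on each paragraph
# dict under a "properties" key; both key names here are assumptions, not
# the original implementation:
def get_indent(para):
    # Return the left indent, defaulting to no indent.
    return int(para.get("properties", {}).get("indent", 0) or 0)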
def main():
    # Use the West XML to get headings in titlecase. The Lexis document has
    # big level headings in all caps. Also get a list of words that always
    # appear in lowercase so we can correct the remaining titles reasonably well.
    west_dom = etree.parse(open("/home/user/data/dc_code/2012-12-11.xml", "rb"))
    is_upper_word = set()
    for h in west_dom.xpath('//level[not(type="Section")]/heading'):
        t = h.text.replace(" (Refs & Annos)", "")
        t = re.sub(r"[\s\.]+$", "", t)
        heading_case_fix[t.upper()] = t
        for wd in t.split(" "):
            if not re.search(r"[a-z]", wd):
                continue
            if wd == wd.lower():
                little_words.add(wd)
            else:
                is_upper_word.add(wd.lower())
    little_words.difference_update(is_upper_word)
    little_words.remove("disapproval")  # manual fix
    little_words.remove("abolished")  # manual fix

    # Form the output DOM.
    dom = etree.Element("level")
    make_node(dom, "type", "document")
    make_node(dom, "heading", "Code of the District of Columbia")
    meta = make_node(dom, "meta", None)
    make_node(meta, "recency", "current through DC Act 19-658; unofficial through D.C. Act 19-682")

    # Open the Word file. Use a cached json file if it exists
    # since that's faster than opening the raw .docx file.
    if not os.path.exists("/tmp/doc.cache.json"):
        doc = open_docx(sys.argv[1], pict=pict_handler)
        with open("/tmp/doc.cache.json", "w") as doccache:
            json.dump(doc, doccache, indent=2)
    else:
        with open("/tmp/doc.cache.json") as doccache:
            doc = json.load(doccache)

    try:
        # Parse each section.
        state = {"stack": None}
        for section in doc["sections"]:
            parse_doc_section(section, dom, state)
    except Exception:
        import traceback
        traceback.print_exc()

    # Output, being careful we get UTF-8 to the byte stream.
    sys.stdout.buffer.write(etree.tostring(dom, pretty_print=True,
                                           encoding="utf-8",
                                           xml_declaration=True))
def make_node(parent, tag, text, **attrs):
    # (The function header and the creation of n are reconstructed from the
    # call sites above; the excerpt began at parent.append(n).)
    n = etree.Element(tag)
    parent.append(n)
    n.text = text
    for k, v in attrs.items():
        if v is None:
            continue
        if isinstance(v, datetime.datetime):
            v = format_datetime(v)
        elif isinstance(v, (bool, int)):
            v = str(v)
        # Trailing triple underscores let callers pass reserved words
        # (e.g. class___) as attribute names.
        n.set(k.replace("___", ""), v)
    return n


# Form the output dom.
dom = etree.Element("measure")

# Load the .docx file.
doc = open_docx(sys.argv[1])

# Parse the header.
header_text = []
for sec in doc["header"]:
    for p in sec["paragraphs"]:
        header_text.append(" ".join(run["text"] for run in p["runs"]))
header_text = "\n".join(header_text)

stat_volume, stat_page, law_type, council_period, law_num, eff_date, exp_date = \
    re.match(r"COUNCIL OF THE DISTRICT OF COLUMBIA\s+(\d+) DCSTAT (\d+)\n"
             r"(D\.C\. (?:Law|Act|Resolution)) (\d+)-(\d+), "
             r"effective ([^(]*[^\s(])"
             r"(?: \(Expiration date ([^(]*)\))?",
             header_text).groups()
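# A worked example of the header pattern above; the sample text is
# illustrative, shaped to match the regex, not taken from a real measure:
#
#   COUNCIL OF THE DISTRICT OF COLUMBIA 59 DCSTAT 1234
#   D.C. Law 19-123, effective March 1, 2013
#
# .groups() then yields:
#   ('59', '1234', 'D.C. Law', '19', '123', 'March 1, 2013', None)
# with exp_date None because the optional "(Expiration date ...)" clause
# is absent.


# format_datetime is also undefined in this excerpt; presumably it renders
# a datetime for XML attributes. An ISO-8601 sketch under that assumption:
def format_datetime(dt):
    return dt.isoformat()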