Example #1
def parse_file(dom, path_to_file, start_para_index):
    # Open the Word file. Use a cached json file if it exists
    # since that's faster than opening the raw .docx file.
    print('\nparsing {}'.format(path_to_file), file=sys.stderr)
    fhash = _hashfile(path_to_file)
    doc = None
    tmp_doc = "/tmp/doc.cache.{}.json".format(fhash)
    if os.path.exists(tmp_doc):
        doc = json.load(open(tmp_doc))
    if doc is None:
        doc = open_docx(path_to_file, pict=pict_handler)
        # Leftover debugging call; its result is discarded.
        div_re.search('./2015-06/Division VIII.docx').group('div')
        for section in doc['sections']:
            for para_index, para in enumerate(section["paragraphs"],
                                              start_para_index):
                para['index'] = para_index
            start_para_index += len(section['paragraphs'])
        with open(tmp_doc, "w") as doccache:
            json.dump(doc, doccache, indent=2)
    try:
        # Parse each section.
        for section in doc["sections"]:
            parse_doc_section(section, dom)
    except Exception:
        import traceback
        traceback.print_exc()
    return start_para_index
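
Example #1 depends on a _hashfile helper that is not shown. A minimal sketch, assuming it returns a hex digest of the file contents so the cache name changes whenever the source .docx changes:

import hashlib

def _hashfile(path, chunk_size=65536):
    # Hash the file in fixed-size chunks so large .docx files
    # are never read into memory all at once.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()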
Example #2
def main():
	# Use the West XML to get headings in titlecase. The Lexis document has
	# big level headings in all caps.
	west_dom = etree.parse(open("/home/user/data/dc_code/2012-12-11.xml", "rb"))
	for h in west_dom.xpath('//heading'):
		t = h.text.replace(" (Refs & Annos)", "")
		t = re.sub(r"[\s.]+$", "", t)
		heading_case_fix[t.upper()] = t

	# Form the output DOM.
	dom = etree.Element("dc-code")
	meta = make_node(dom, "meta", None)
	make_node(meta, "title", "Code of the District of Columbia")
	make_node(meta, "recency", "current through DC Act 19-658; unofficial through D.C. Act 19-682")
	
	# Open the Word file. Use a cached json file if it exists
	# since that's faster than opening the raw .docx file.
	if not os.path.exists("/tmp/doc.cache.json"):
		doc = open_docx(sys.argv[1], pict=pict_handler)
		with open("/tmp/doc.cache.json", "w") as doccache:
			json.dump(doc, doccache, indent=2)
	else:
		doc = json.load(open("/tmp/doc.cache.json"))
	
	try:
		# Parse each section.
		state = { "stack": None }
		for section in doc["sections"]:
			parse_doc_section(section, dom, state)
	except Exception:
		import traceback
		traceback.print_exc()

	# Output, being careful we get UTF-8 to the byte stream.
	sys.stdout.buffer.write(etree.tostring(dom, pretty_print=True, encoding="utf-8", xml_declaration=True))
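
The heading_case_fix table built in main() maps each all-caps Lexis heading to its West titlecase form. A sketch of how such a table might be consulted when emitting headings (fix_heading_case is a hypothetical name, not part of the original code):

def fix_heading_case(heading, heading_case_fix):
    # Use the West titlecase version when we have one;
    # otherwise keep the heading as it appeared in Lexis.
    return heading_case_fix.get(heading.upper().strip(), heading)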
Example #3
def parse_file(dom, path_to_file, start_para_index):
	# Open the Word file. Use a cached json file if it exists
	# since that's faster than opening the raw .docx file.
	print('\nparsing {}'.format(path_to_file), file=sys.stderr)
	fhash = _hashfile(path_to_file)
	doc = None
	tmp_doc = "/tmp/doc.cache.{}.json".format(fhash)
	if os.path.exists(tmp_doc):
		doc = json.load(open(tmp_doc))
		print('loading from', tmp_doc, file=sys.stderr)
	else:
		print('saving to', tmp_doc, file=sys.stderr)

	if doc is None:
		doc = open_docx(path_to_file, pict=pict_handler)
		# Leftover debugging call; its result is discarded.
		div_re.search('./2015-06/Division VIII.docx').group('div')
		for section in doc['sections']:
			for para_index, para in enumerate(section["paragraphs"], start_para_index):
				para['index'] = para_index
			start_para_index += len(section['paragraphs'])
		with open(tmp_doc, "w") as doccache:
			json.dump(doc, doccache, indent=2)
	try:
		# Parse each section.
		for section in doc["sections"]:
			parse_doc_section(section, dom)
	except Exception:
		import traceback
		traceback.print_exc()
	return start_para_index
Example #4
def parse_title(fn, dom, toc_location_stack):
	# Load the .docx file.
	doc = open_docx(fn, drawing=drawing_handler)
	
	# Parse each section.
	for section in doc["sections"]:
		parse_section(section, dom, toc_location_stack)
Example #5
def main():
    # Use the West XML to get headings in titlecase. The Lexis document has
    # big level headings in all caps. Also get a list of words that always
    # appear in lowercase so we can correct the remaining titles reasonably well.
    west_dom = etree.parse(
        open("/home/user/data/dc_code/schema-2/2012-12-11.xml", "rb"))
    is_upper_word = set()
    for h in west_dom.xpath('//level[not(type="section")]/heading'):
        t = h.text.replace(" (Refs & Annos)", "")
        t = re.sub(r"[\s.]+$", "", t)
        heading_case_fix[t.upper()] = t
        for wd in t.split(" "):
            if not re.search(r"[a-z]", wd): continue
            if wd == wd.lower():
                little_words.add(wd)
            else:
                is_upper_word.add(wd.lower())
    little_words.difference_update(is_upper_word)
    little_words.remove("disapproval")  # manual fix
    little_words.remove("abolished")  # manual fix

    # Form the output DOM.
    dom = etree.Element("level")
    dom.set("type", "document")
    make_node(dom, "heading", "Code of the District of Columbia")
    meta = make_node(dom, "meta", None)
    make_node(meta, "recency", sys.argv[2])

    # Open the Word file. Use a cached json file if it exists
    # since that's faster than opening the raw .docx file.
    fhash = hashfile(sys.argv[1])
    doc = None
    if os.path.exists("/tmp/doc.cache.json"):
        fdata = json.load(open("/tmp/doc.cache.json"))
        if fdata["hash"] == fhash:
            doc = fdata["doc"]
    if doc is None:
        doc = open_docx(sys.argv[1], pict=pict_handler)
        with open("/tmp/doc.cache.json", "w") as doccache:
            json.dump({"hash": fhash, "doc": doc}, doccache, indent=2)

    try:
        # Parse each section.
        state = {"stack": None}
        for section in doc["sections"]:
            parse_doc_section(section, dom, state)
    except Exception:
        import traceback
        traceback.print_exc()

    # Output, being careful we get UTF-8 to the byte stream.
    sys.stdout.buffer.write(
        etree.tostring(dom,
                       pretty_print=True,
                       encoding="utf-8",
                       xml_declaration=True))
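
This variant stores the input file's hash alongside the cached document, so a stale cache is rebuilt automatically when the .docx changes. The same pattern as a small reusable helper (a sketch; load_or_build and its signature are illustrative, not from the original code):

import json
import os

def load_or_build(cache_path, fhash, build):
    # Return the cached document when the stored hash still matches;
    # otherwise rebuild it and rewrite the cache file.
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            fdata = json.load(f)
        if fdata.get("hash") == fhash:
            return fdata["doc"]
    doc = build()
    with open(cache_path, "w") as f:
        json.dump({"hash": fhash, "doc": doc}, f, indent=2)
    return doc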
Example #6
def parse_file(path_to_file):
	doc = open_docx(path_to_file, pict=pict_handler)
	paras = []
	for section in doc['sections']:
		for index, para in enumerate(section['paragraphs'], len(paras)):
			para['index'] = index
			para['indent'] = get_indent(para)
			for run in para['runs']:
				run['text'] = run['text'].replace('\u201c', '"').replace('\u201d', '"')
		paras += section['paragraphs']

	return paras
Example #7
def parse_file(path_to_file):
    doc = open_docx(path_to_file, pict=pict_handler)
    paras = []
    for section in doc['sections']:
        for index, para in enumerate(section['paragraphs'], len(paras)):
            para['index'] = index
            para['indent'] = get_indent(para)
            for run in para['runs']:
                run['text'] = run['text'].replace('\u201c', '"').replace(
                    '\u201d', '"')
        paras += section['paragraphs']

    return paras
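
Examples #6 and #7 normalize curly double quotes to plain ASCII quotes one replace() at a time. The same normalization can be written once with str.translate, which also extends naturally to single quotes (a sketch, not from the original code):

QUOTE_MAP = str.maketrans({
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
})

def normalize_quotes(text):
    return text.translate(QUOTE_MAP)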
Example #8
def main():
	# Use the West XML to get headings in titlecase. The Lexis document has
	# big level headings in all caps. Also get a list of words that always
	# appear in lowercase so we can correct the remaining titles reasonably well.
	west_dom = etree.parse(open("/home/user/data/dc_code/2012-12-11.xml", "rb"))
	is_upper_word = set()
	for h in west_dom.xpath('//level[not(type="Section")]/heading'):
		t = h.text.replace(" (Refs & Annos)", "")
		t = re.sub(r"[\s.]+$", "", t)
		heading_case_fix[t.upper()] = t
		for wd in t.split(" "):
			if not re.search(r"[a-z]", wd): continue
			if wd == wd.lower():
				little_words.add(wd)
			else:
				is_upper_word.add(wd.lower())
	little_words.difference_update(is_upper_word)
	little_words.remove("disapproval") # manual fix
	little_words.remove("abolished") # manual fix

	# Form the output DOM.
	dom = etree.Element("level")
	make_node(dom, "type", "document")
	make_node(dom, "heading", "Code of the District of Columbia")
	meta = make_node(dom, "meta", None)
	make_node(meta, "recency", "current through DC Act 19-658; unofficial through D.C. Act 19-682")
	
	# Open the Word file. Use a cached json file if it exists
	# since that's faster than opening the raw .docx file.
	if not os.path.exists("/tmp/doc.cache.json"):
		doc = open_docx(sys.argv[1], pict=pict_handler)
		with open("/tmp/doc.cache.json", "w") as doccache:
			json.dump(doc, doccache, indent=2)
	else:
		doc = json.load(open("/tmp/doc.cache.json"))
	
	try:
		# Parse each section.
		state = { "stack": None }
		for section in doc["sections"]:
			parse_doc_section(section, dom, state)
	except Exception:
		import traceback
		traceback.print_exc()

	# Output, being careful we get UTF-8 to the byte stream.
	sys.stdout.buffer.write(etree.tostring(dom, pretty_print=True, encoding="utf-8", xml_declaration=True))
Example #9
def make_node(parent, tag, text, **attrs):
  # Create a child element, attach it to parent, and set its text.
  n = etree.Element(tag)
  parent.append(n)
  n.text = text
  for k, v in attrs.items():
    if v is None: continue
    if isinstance(v, datetime.datetime):
      v = format_datetime(v)
    elif isinstance(v, (bool, int)):
      v = str(v)
    n.set(k.replace("___", ""), v)
  return n

# Form the output dom.
dom = etree.Element("measure")

# Load the .docx file.
doc = open_docx(sys.argv[1])

# Parse the header.

header_text = []
for sec in doc["header"]:
	for p in sec["paragraphs"]:
		header_text.append(" ".join(run["text"] for run in p["runs"]))
header_text = "\n".join(header_text)

stat_volume, stat_page, law_type, council_period, law_num, eff_date, exp_date = \
	re.match(r"COUNCIL OF THE DISTRICT OF COLUMBIA\s+(\d+) DCSTAT (\d+)\n"
			 r"(D.C. (?:Law|Act|Resolution)) (\d+)-(\d+), "
			 r"effective ([^(]*[^\s(])"
			 r"(?: \(Expiration date ([^(]*)\))?", header_text).groups()
Example #10
def make_node(parent, tag, text, **attrs):
    # Create a child element, attach it to parent, and set its text.
    n = etree.Element(tag)
    parent.append(n)
    n.text = text
    for k, v in attrs.items():
        if v is None: continue
        if isinstance(v, datetime.datetime):
            v = format_datetime(v)
        elif isinstance(v, (bool, int)):
            v = str(v)
        n.set(k.replace("___", ""), v)
    return n


# Form the output dom.
dom = etree.Element("measure")

# Load the .docx file.
doc = open_docx(sys.argv[1])

# Parse the header.

header_text = []
for sec in doc["header"]:
    for p in sec["paragraphs"]:
        header_text.append(" ".join(run["text"] for run in p["runs"]))
header_text = "\n".join(header_text)

stat_volume, stat_page, law_type, council_period, law_num, eff_date, exp_date = \
    re.match(r"COUNCIL OF THE DISTRICT OF COLUMBIA\s+(\d+) DCSTAT (\d+)\n"
             r"(D\.C\. (?:Law|Act|Resolution)) (\d+)-(\d+), "
             r"effective ([^(]*[^\s(])"
             r"(?: \(Expiration date ([^(]*)\))?", header_text).groups()