def test_equations(self):
    """ check the path string generated for four elements of the fixture document

    Each element is looked up by id in tests/path_extraction.xml and the
    first item returned by generate_path_string is compared with the
    expected citation text.
    """
    tree = etree.parse('tests/path_extraction.xml', parser=self.parser)
    expected_paths = (
        ('zzz', 'Test Act 666 s 2(1)(a)(i)'),
        ('yyy', 'Test Act 666 s 2(1)(a)'),
        ('xxx', 'Test Act 666 s 2(1)'),
        ('aaa', 'Test Act 666 sch 1 cl 1(1)'),
    )
    for element_id, expected in expected_paths:
        # first element carrying the id under test
        node = tree.xpath('.//*[@id="%s"]' % element_id)[0]
        self.assertEqual(generate_path_string(node)[0], expected)
def test_equations(self):
    """ path strings for the ids zzz, yyy, xxx and aaa in tests/path_extraction.xml """
    document = etree.parse('tests/path_extraction.xml', parser=self.parser)

    def path_of(element_id):
        # first match for the id, then the first item of the generated result
        match = document.xpath('.//*[@id="' + element_id + '"]')[0]
        return generate_path_string(match)[0]

    self.assertEqual(path_of('zzz'), 'Test Act 666 s 2(1)(a)(i)')
    self.assertEqual(path_of('yyy'), 'Test Act 666 s 2(1)(a)')
    self.assertEqual(path_of('xxx'), 'Test Act 666 s 2(1)')
    self.assertEqual(path_of('aaa'), 'Test Act 666 sch 1 cl 1(1)')
def run(db, config):
    """ insert an id_lookup row for every distinct element id in latest_instruments

    Rows of latest_instruments are read one at a time; each document is
    parsed and every element with an id attribute is considered.  An id is
    inserted only the first time it is seen during this run.  A running row
    count is printed every 100 rows.

    db -- connection providing the reading and the writing cursor
    config -- not used by this function
    """
    insert_sql = """ INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES (%(govt_id)s, %(parent_id)s, %(repr)s)"""
    seen_ids = set()
    with db.cursor() as cur, db.cursor() as out:
        cur.execute("""SELECT id, document FROM latest_instruments""")
        processed = 0
        row = cur.fetchone()
        while row:
            if processed % 100 == 0:
                print(processed)
            processed += 1
            for el in etree.fromstring(row[1]).xpath('//*[@id]'):
                govt_id = el.attrib.get('id')
                if govt_id in seen_ids:
                    continue
                out.execute(insert_sql, {
                    'govt_id': govt_id,
                    'parent_id': row[0],
                    'repr': generate_path_string(el)[0],
                })
                # only remembered once the insert went through
                seen_ids.add(govt_id)
            row = cur.fetchone()
def id_lookup(db):
    """ rebuild the id_lookup table from every instrument document

    Deletes the existing rows, then streams (id, title, document) rows
    through the named cursor "law_cursor" and collects one
    (govt_id, parent_id, repr) tuple per element that carries an id
    attribute.  Pending tuples are written with a multi-row INSERT once
    more than 100000 have accumulated, the remainder is written after the
    last row, and the transaction is committed at the end.  Progress
    (rows read, tuples pending) is printed every 10 rows.

    db -- connection used for the delete, the select and the inserts
    """
    insert_sql = "INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES "

    def flush(cur, out, rows):
        # one multi-row INSERT; mogrify quotes every tuple, then the buffer is emptied in place
        args_str = ','.join(cur.mogrify("(%s,%s,%s)", x) for x in rows)
        out.execute(insert_sql + args_str)
        rows[:] = []

    with db.cursor(cursor_factory=extras.RealDictCursor) as cur:
        cur.execute(""" delete from id_lookup""")
    with db.cursor(cursor_factory=extras.RealDictCursor, name="law_cursor") as cur, db.cursor() as out:
        cur.execute("""SELECT d.id, title, document FROM instruments i join documents d on i.id = d.id """)
        results = cur.fetchmany(1)
        count = 0
        id_results = []
        while results:
            for result in results:
                if count % 10 == 0:
                    print("%s %s" % (count, len(id_results)))
                count += 1
                root = etree.fromstring(result['document'], parser=p)
                # findall() rejects absolute paths on an element ("cannot use
                # absolute path on element"), so the search must be relative
                for el in root.findall('.//*[@id]'):
                    new_id = el.attrib.get('id')
                    id_results.append((
                        new_id,
                        result['id'],
                        generate_path_string(el, title=unicode(result['title'].decode('utf-8')))[0]))
            results = cur.fetchmany(1)
            if len(id_results) > 100000:
                flush(cur, out, id_results)
        if id_results:
            flush(cur, out, id_results)
    db.commit()
def instrument_location(instrument, location, args):
    """ build the fragment response for a location inside an instrument

    The location is resolved with nodes_from_path_string.  When that raises
    CustomException, or yields just the whole document tree, the lookup is
    repeated with link_to_canonical(location).  The matched nodes are culled
    and rendered to html.

    instrument -- provides get_tree(), title, id and attributes
    location -- path string to resolve
    args -- not used by this function

    Returns a dict describing the fragment, including the query needed to
    request it again.
    """
    try:
        nodes = nodes_from_path_string(instrument.get_tree(), location)
        # a single match that is the document itself is treated as a miss
        if len(nodes) == 1 and nodes[0] == instrument.get_tree():
            raise CustomException('try again')
    except CustomException:
        nodes = nodes_from_path_string(instrument.get_tree(), link_to_canonical(location))
    full_location, _, path = generate_path_string(nodes[0])
    culled = cull_tree(nodes)
    html = etree.tostring(tohtml(culled), encoding='UTF-8', method="html")
    response = {
        'html_content': html,
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(instrument.attributes['date_as_at']),
        'format': 'fragment',
    }
    response['query'] = {
        'doc_type': 'instrument',
        'document_id': instrument.id,
        'find': 'location',
        'location': path,
    }
    return response
def find_references(tree, document_id, title, id_lookup):
    """ find and source all external references

    Every element with an href attribute, and every link element with a
    resourcepair child, is described by its target id, its text and its
    generated path.  Only links whose target id has a truthy entry in
    id_lookup are returned, as
    (document_id, target id, path[0], path[1], text) tuples.
    """
    links = []
    for node in tree.xpath('.//*[@href]|.//link[resourcepair]'):
        links.append({
            'id': safe_target(node),
            'text': etree.tostring(node, method="text", encoding="UTF-8"),
            'path': generate_path_string(node, title=title),
        })
    return [
        (document_id, link['id'], link['path'][0], link['path'][1], link['text'])
        for link in links
        if id_lookup.get(link['id'])
    ]
def add_new_ids(tree, document_id, title, db=None):
    """ add the new ids to the id look up

    One (govt_id, parent_id, repr) row is inserted into id_lookup for each
    element of tree that carries an id attribute, using a single multi-row
    INSERT.  A tree without any such element inserts nothing.  The
    transaction is committed in both cases.

    tree -- parsed document to scan
    document_id -- stored as parent_id of every row
    title -- passed to generate_path_string when building repr
    db -- connection to use; get_db() is called when omitted
    """
    db = db or get_db()
    with db.cursor() as cur:
        id_results = [
            (el.attrib.get('id'), document_id, generate_path_string(el, title=title)[0])
            for el in tree.xpath('//*[@id]')
        ]
        # "VALUES" followed by nothing is a SQL syntax error, so skip the
        # statement entirely when there are no rows
        if id_results:
            args_str = ','.join(cur.mogrify("(%s,%s,%s)", x) for x in id_results)
            cur.execute("INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES " + args_str)
    db.commit()
def find_references(tree, document_id, title, id_lookup):
    """ find and source all external references

    Returns one (document_id, target id, path[0], path[1], text) tuple for
    each href-bearing element, or link element with a resourcepair child,
    whose target id has a truthy entry in id_lookup.
    """
    def describe(node):
        # target id, text content and generated path of a single link
        link = {}
        link['id'] = safe_target(node)
        link['text'] = etree.tostring(node, method="text", encoding="UTF-8")
        link['path'] = generate_path_string(node, title=title)
        return link

    links = [describe(node) for node in tree.xpath('.//*[@href]|.//link[resourcepair]')]
    section = []
    for link in links:
        if not id_lookup.get(link['id']):
            continue
        section.append(
            (document_id, link['id'], link['path'][0], link['path'][1], link['text']))
    return section
def id_lookup(db):
    """ rebuild the id_lookup table from every instrument document

    Deletes the existing rows, then streams (id, title, document) rows
    through the named cursor "law_cursor".  For every element with an id
    attribute one (govt_id, parent_id, repr) tuple is collected, at most
    once per (govt_id, parent_id) pair.  Pending tuples are written with a
    multi-row INSERT once more than 100000 have accumulated, the remainder
    is written after the last row, and the transaction is committed at the
    end.  Progress (rows read, tuples pending) is printed every 10 rows.

    A document that fails while being parsed or walked is reported and
    skipped; tuples already collected from it are kept.

    db -- connection used for the delete, the select and the inserts
    """
    upsert = ""  # " ON CONFLICT (id_lookup_uniq) DO NOTHING"
    insert_sql = "INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES "

    def flush(cur, out, rows):
        # one multi-row INSERT; mogrify quotes every tuple, then the buffer is emptied in place
        args_str = ','.join(cur.mogrify("(%s,%s,%s)", x) for x in rows)
        out.execute(insert_sql + args_str + upsert)
        rows[:] = []

    with db.cursor(cursor_factory=extras.RealDictCursor) as cur:
        cur.execute(""" delete from id_lookup""")
    with db.cursor(cursor_factory=extras.RealDictCursor, name="law_cursor") as cur, db.cursor() as out:
        cur.execute(
            """SELECT d.id, title, document FROM instruments i join documents d on i.id = d.id """)
        results = cur.fetchmany(1)
        count = 0
        id_results = []
        completed = set()
        while results:
            for result in results:
                if count % 10 == 0:
                    print("%s %s" % (count, len(id_results)))
                count += 1
                try:
                    root = etree.fromstring(result['document'], parser=p)
                    for el in root.findall('.//*[@id]'):
                        key = (el.attrib.get('id'), result['id'])
                        if key in completed:
                            continue
                        id_results.append((
                            key[0],
                            key[1],
                            generate_path_string(
                                el, title=unicode(result['title'].decode('utf-8')))[0]))
                        completed.add(key)
                except Exception as exc:
                    # best effort: one bad document must not stop the rebuild,
                    # but interrupts and exits still propagate and the failure is visible
                    print("skipping document %s: %r" % (result['id'], exc))
                    continue
            results = cur.fetchmany(1)
            if len(id_results) > 100000:
                flush(cur, out, id_results)
        if id_results:
            flush(cur, out, id_results)
    db.commit()
def add_new_ids(tree, document_id, title, db=None):
    """ add the new ids to the id look up

    Collects a (govt_id, parent_id, repr) row for each element of tree
    with an id attribute and writes them to id_lookup in one multi-row
    INSERT.  When the tree has no such element no statement is executed.
    The transaction is committed either way.

    tree -- parsed document to scan
    document_id -- stored as parent_id of every row
    title -- passed to generate_path_string when building repr
    db -- connection to use; get_db() is called when omitted
    """
    id_results = []
    db = db or get_db()
    with db.cursor() as cur:
        for el in tree.xpath('//*[@id]'):
            new_id = el.attrib.get('id')
            id_results.append(
                (new_id, document_id, generate_path_string(el, title=title)[0]))
        if not id_results:
            # an INSERT with an empty VALUES list is a SQL syntax error
            args_str = None
        else:
            args_str = ','.join(cur.mogrify("(%s,%s,%s)", x) for x in id_results)
        if args_str is not None:
            cur.execute("INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES " + args_str)
    db.commit()
def instrument_govt_location(instrument, id, link_text, args):
    """ build the fragment response for a govt id inside an instrument

    The nodes are chosen by decide_govt_or_path from the instrument tree,
    the id and the link text, then culled and rendered to html.

    instrument -- provides get_tree(), title, id and attributes
    id -- govt id to locate; echoed back as query['govt_location']
    link_text -- passed to decide_govt_or_path together with id
    args -- not used by this function

    Returns a dict describing the fragment, including the query needed to
    request it again.
    """
    nodes = decide_govt_or_path(instrument.get_tree(), id, link_text)
    # only the first of the three generated values is used here
    full_location, _, _location = generate_path_string(nodes[0])
    culled = cull_tree(nodes)
    html = etree.tostring(tohtml(culled), encoding='UTF-8', method="html")
    attributes = instrument.attributes
    query = {
        'doc_type': 'instrument',
        'document_id': instrument.id,
        'find': 'govt_location',
        'govt_location': id,
    }
    return {
        'html_content': html,
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": attributes['latest'],
        "path": attributes['path'],
        "date_as_at_str": format_govt_date(attributes['date_as_at']),
        'format': 'fragment',
        'query': query,
    }
def find_all_definitions(tree, definitions, document_id, expire=True, title=None):
    """ add a Definition to definitions for each defining def-term in tree

    def-term elements inside skeletons, history, table, amend or
    schedule.amendments ancestors are ignored.  A term is also skipped when
    it sits between an opening "(", "(the " or "(a " and a closing ")",
    when its first text chunk is at most one character long once brackets
    are removed, when it lies in a label-para nested in a def-para, or when
    it has no text or no def-para/para ancestor.

    For each remaining term the parent (def-para, else para) is tagged with
    a temp-def-id, the term gets an id if it has none, and a catalex-src
    element describing the source is serialized to html and stored in the
    Definition.

    tree -- parsed document; its elements are modified in place
    definitions -- collection receiving the Definition objects via add()
    document_id -- integer id of the document, used in the generated ids
    expire -- when true the lifetime comes from infer_life_time(parent),
              otherwise ['root'], 100, False is used
    title -- passed to generate_path_string
    """
    term_query = (
        ".//def-term"
        "[not(ancestor::skeletons)]"
        "[not(ancestor::history)]"
        "[not(ancestor::table)]"
        "[not(ancestor::amend)]"
        "[not(ancestor::schedule.amendments)]")
    opening_bracket = re.compile(r'.*(\(|\(the |\(a )$', flags=re.DOTALL)

    def first_ancestor(node, tag):
        # raises StopIteration when node has no ancestor with this tag
        return next(node.iterancestors(tag))

    def get_parent(node):
        try:
            return first_ancestor(node, 'def-para')
        except StopIteration:
            return first_ancestor(node, 'para')

    count = 0
    for node in tree.xpath(term_query):
        try:
            # hack: drop brackets from the first text chunk so that
            # single-character placeholders such as 'A' fail the length check below
            text = re.sub('[][()]', '', next(node.itertext()))
            # a term wrapped in parentheses in running text is not a definition
            try:
                before = node.xpath('preceding::text()[1]')[0]
                if opening_bracket.match(before) and node.xpath('following::text()[1]')[0][0] == ')':
                    continue
            except IndexError:
                pass
            if len(text) <= 1:
                continue
            # another hack: a term in a label-para which is itself in a
            # def-para is not the primary definition
            try:
                first_ancestor(first_ancestor(node, 'label-para'), 'def-para')
            except StopIteration:
                pass
            else:
                continue
            parent = get_parent(node)
            # flag used to find the parent again when the definition is
            # inserted into the db (must survive serializing)
            temp_id = parent.attrib.get('temp-def-id', str(uuid.uuid4()))
            parent.attrib['temp-def-id'] = temp_id
            src_el = etree.Element('catalex-src')
            node.attrib['id'] = node.attrib.get('id', str(uuid.uuid4()))
            src_el.attrib['src'] = node.attrib.get('id')
            src_el.attrib['target-id'] = '%d' % document_id
            src_el.attrib['link-id'] = '%d-%d' % (document_id, count)
            label, href, location = generate_path_string(node, title=title)
            src_el.text = label
            src_el.attrib['href'] = href
            src_el.attrib['location'] = location
            src_id = src_el.attrib['src']
            src = etree.tostring(src_el, method="html", encoding="UTF-8")
            if expire:
                expiry_tags, priority, exclusive = infer_life_time(parent)
            else:
                expiry_tags, priority, exclusive = ['root'], 100, False
            try:
                context_parent = first_ancestor(parent, 'para')
                context = context_parent.xpath('./text')[0]
                context_id = context.attrib.get('temp-def-id', str(uuid.uuid4()))
                context.attrib['temp-def-id'] = context_id
                result = {'context_id': context_id, 'temp_id': temp_id, 'src': src}
            except (StopIteration, IndexError):
                # no enclosing para, or no text child: record without a context
                result = {'temp_id': temp_id, 'src': src}
            definitions.add(Definition(
                full_word=text,
                results=[result],
                id='%d-%s' % (document_id, src_id),
                document_id=document_id,
                expiry_tags=expiry_tags,
                priority=priority,
                exclusive=exclusive))
            count += 1
        except StopIteration:
            # no text in the term, or no def-para/para ancestor: nothing to define
            pass