Exemplo n.º 1
0
 def test_equations(self):
     # Each pair is (XPath selecting one element of the fixture, the string
     # expected as the first item returned by generate_path_string for it).
     document = etree.parse('tests/path_extraction.xml', parser=self.parser)
     expectations = (
         ('.//*[@id="zzz"]', 'Test Act 666 s 2(1)(a)(i)'),
         ('.//*[@id="yyy"]', 'Test Act 666 s 2(1)(a)'),
         ('.//*[@id="xxx"]', 'Test Act 666 s 2(1)'),
         ('.//*[@id="aaa"]', 'Test Act 666 sch 1 cl 1(1)'),
     )
     for query, citation in expectations:
         node = document.xpath(query)[0]
         self.assertEqual(generate_path_string(node)[0], citation)
Exemplo n.º 2
0
 def test_equations(self):
     # Verifies the first item returned by generate_path_string for four
     # elements of the fixture, from the most deeply nested provision outwards
     # and finally a schedule clause.
     fixture = etree.parse('tests/path_extraction.xml', parser=self.parser)

     def expect(query, citation):
         # Take the first element matching the query and compare its path.
         target = fixture.xpath(query)[0]
         self.assertEqual(generate_path_string(target)[0], citation)

     expect('.//*[@id="zzz"]', 'Test Act 666 s 2(1)(a)(i)')
     expect('.//*[@id="yyy"]', 'Test Act 666 s 2(1)(a)')
     expect('.//*[@id="xxx"]', 'Test Act 666 s 2(1)')
     expect('.//*[@id="aaa"]', 'Test Act 666 sch 1 cl 1(1)')
Exemplo n.º 3
0
def run(db, config):
    """Fill ``id_lookup`` with one row per distinct ``id`` attribute value.

    Reads every ``(id, document)`` row of ``latest_instruments``, parses the
    document with ``etree.fromstring`` and, for each element carrying an
    ``id`` attribute, inserts ``(govt_id, parent_id, repr)`` where ``repr`` is
    the first item returned by ``generate_path_string``. An ``id`` value that
    was already seen (in this or an earlier document) is not inserted again.

    Args:
        db: connection object; ``db.cursor()`` must be usable as a context
            manager. This function does not commit.
        config: unused here.
    """
    # Built once: the statement text does not depend on the row being inserted.
    insert_sql = """ INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES
                    (%(govt_id)s, %(parent_id)s, %(repr)s)"""
    seen_ids = set()
    with db.cursor() as cur, db.cursor() as out:
        cur.execute("""SELECT id, document FROM latest_instruments""")

        # iter(callable, sentinel) keeps calling fetchone() until it returns None.
        for count, row in enumerate(iter(cur.fetchone, None)):
            if count % 100 == 0:
                # Progress indicator; the parenthesised single-argument form
                # parses and prints the same under Python 2 and Python 3.
                print(count)

            for el in etree.fromstring(row[1]).xpath('//*[@id]'):
                new_id = el.attrib.get('id')
                if new_id in seen_ids:
                    continue
                out.execute(insert_sql, {
                    'govt_id': new_id,
                    'parent_id': row[0],
                    'repr': generate_path_string(el)[0],
                })
                seen_ids.add(new_id)
Exemplo n.º 4
0
def id_lookup(db):
    """Empty ``id_lookup`` and rebuild it from every instrument document.

    Streams ``(id, title, document)`` rows through the named cursor
    ``law_cursor`` one at a time, parses each document and records
    ``(element id, document id, path string)`` for every descendant element
    that has an ``id`` attribute. Rows are written in multi-row INSERTs, flushed
    whenever more than 100000 are pending and once more at the end, then the
    transaction is committed.

    Args:
        db: connection object providing ``cursor(...)`` and ``commit()``.
    """
    insert_prefix = "INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES "

    with db.cursor(cursor_factory=extras.RealDictCursor) as cur:
        cur.execute(""" delete from id_lookup""")
    with db.cursor(cursor_factory=extras.RealDictCursor, name="law_cursor") as cur, db.cursor() as out:

        def flush(rows):
            # Write all pending rows with a single INSERT, then empty the list
            # in place. mogrify() binds each tuple into its "(%s,%s,%s)" group.
            args_str = ','.join(cur.mogrify("(%s,%s,%s)", x) for x in rows)
            out.execute(insert_prefix + args_str)
            rows[:] = []

        cur.execute("""SELECT d.id, title, document FROM instruments i join documents d on i.id = d.id """)
        count = 0
        id_results = []
        results = cur.fetchmany(1)
        while results:
            for result in results:
                if count % 10 == 0:
                    # Progress: documents processed so far and rows pending.
                    # Formatted into one string so the output is identical
                    # whether print is a statement or a function.
                    print('%d %d' % (count, len(id_results)))
                count += 1
                # findall() rejects absolute paths on an element ("cannot use
                # absolute path on element"), so the search must be relative.
                # './/' visits the descendants of the parsed root element.
                for el in etree.fromstring(result['document'], parser=p).findall('.//*[@id]'):
                    new_id = el.attrib.get('id')
                    id_results.append((new_id, result['id'], generate_path_string(el, title=unicode(result['title'].decode('utf-8')))[0]))
            results = cur.fetchmany(1)
            if len(id_results) > 100000:
                flush(id_results)

        # An INSERT with no value tuples would be invalid SQL, so only flush
        # the remainder when there is one.
        if id_results:
            flush(id_results)
    db.commit()
Exemplo n.º 5
0
def instrument_location(instrument, location, args):
    """Return the fragment response for ``location`` inside ``instrument``.

    The location is first resolved with ``nodes_from_path_string``. If that
    raises ``CustomException``, or yields exactly one node equal to the
    instrument's whole tree, it is resolved again using
    ``link_to_canonical(location)``. The first resulting node supplies the full
    title and the location path echoed back in ``query``; the nodes are then
    passed through ``cull_tree`` and rendered to HTML.

    ``args`` is accepted but not used.
    """
    try:
        nodes = nodes_from_path_string(instrument.get_tree(), location)
        matched_whole_tree = len(nodes) == 1 and nodes[0] == instrument.get_tree()
        if matched_whole_tree:
            # Route this case through the same retry as a failed lookup.
            raise CustomException('try again')
    except CustomException:
        nodes = nodes_from_path_string(instrument.get_tree(), link_to_canonical(location))

    full_location, _, path = generate_path_string(nodes[0])
    fragment = cull_tree(nodes)
    html = etree.tostring(tohtml(fragment), encoding='UTF-8', method="html")

    response = {
        'html_content': html,
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(instrument.attributes['date_as_at']),
        'format': 'fragment',
    }
    response['query'] = {
        'doc_type': 'instrument',
        'document_id': instrument.id,
        'find': 'location',
        'location': path,
    }
    return response
Exemplo n.º 6
0
def run(db, config):
    """Fill ``id_lookup`` with one row per distinct ``id`` attribute value.

    Reads every ``(id, document)`` row of ``latest_instruments``, parses the
    document with ``etree.fromstring`` and, for each element carrying an
    ``id`` attribute, inserts ``(govt_id, parent_id, repr)`` where ``repr`` is
    the first item returned by ``generate_path_string``. An ``id`` value that
    was already seen (in this or an earlier document) is not inserted again.

    Args:
        db: connection object; ``db.cursor()`` must be usable as a context
            manager. This function does not commit.
        config: unused here.
    """
    # Built once: the statement text does not depend on the row being inserted.
    insert_sql = """ INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES
                    (%(govt_id)s, %(parent_id)s, %(repr)s)"""
    seen_ids = set()
    with db.cursor() as cur, db.cursor() as out:
        cur.execute("""SELECT id, document FROM latest_instruments""")

        # iter(callable, sentinel) keeps calling fetchone() until it returns None.
        for count, row in enumerate(iter(cur.fetchone, None)):
            if count % 100 == 0:
                # Progress indicator; the parenthesised single-argument form
                # parses and prints the same under Python 2 and Python 3.
                print(count)

            for el in etree.fromstring(row[1]).xpath('//*[@id]'):
                new_id = el.attrib.get('id')
                if new_id in seen_ids:
                    continue
                out.execute(insert_sql, {
                    'govt_id': new_id,
                    'parent_id': row[0],
                    'repr': generate_path_string(el)[0],
                })
                seen_ids.add(new_id)
Exemplo n.º 7
0
def find_references(tree, document_id, title, id_lookup):
    """ find and source all external references

    Every element with an ``href`` attribute, and every ``link`` element with a
    ``resourcepair`` child, is described by its target id (``safe_target``),
    its text content and its ``generate_path_string`` result. Only links whose
    id has a truthy entry in ``id_lookup`` are returned, as tuples of
    ``(document_id, target id, path[0], path[1], text)``.
    """
    # Phase 1: describe every candidate link before any filtering happens.
    candidates = [
        {
            'id': safe_target(node),
            'text': etree.tostring(node, method="text", encoding="UTF-8"),
            'path': generate_path_string(node, title=title),
        }
        for node in tree.xpath('.//*[@href]|.//link[resourcepair]')
    ]
    # Phase 2: keep only the links whose target is known to the lookup.
    known = [link for link in candidates if id_lookup.get(link['id'])]
    return [
        (document_id, link['id'], link['path'][0], link['path'][1], link['text'])
        for link in known
    ]
Exemplo n.º 8
0
def add_new_ids(tree, document_id, title, db=None):
    """ add the new ids to the id look up

    For every element of ``tree`` that has an ``id`` attribute, inserts
    ``(id, document_id, path string)`` into ``id_lookup`` using one multi-row
    INSERT, then commits. When the tree has no such elements the INSERT is
    skipped (a ``VALUES`` clause with no tuples is not valid SQL); the commit
    still happens.

    Args:
        tree: parsed document exposing ``xpath``.
        document_id: stored as ``parent_id`` of every inserted row.
        title: forwarded to ``generate_path_string``.
        db: connection to use; defaults to ``get_db()`` when falsy.
    """
    db = db or get_db()
    with db.cursor() as cur:
        id_results = [
            (el.attrib.get('id'), document_id, generate_path_string(el, title=title)[0])
            for el in tree.xpath('//*[@id]')
        ]
        if id_results:
            # mogrify() binds each tuple into its own "(%s,%s,%s)" group.
            args_str = ','.join(cur.mogrify("(%s,%s,%s)", x) for x in id_results)
            cur.execute("INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES " + args_str)
    db.commit()
Exemplo n.º 9
0
def find_references(tree, document_id, title, id_lookup):
    """ find and source all external references

    Returns one ``(document_id, target id, path[0], path[1], text)`` tuple for
    each matching link element whose target id has a truthy entry in
    ``id_lookup``. Matching elements are those with an ``href`` attribute and
    ``link`` elements with a ``resourcepair`` child.
    """
    # Describe every matching element first; filtering happens afterwards.
    described = []
    for element in tree.xpath('.//*[@href]|.//link[resourcepair]'):
        target = safe_target(element)
        text = etree.tostring(element, method="text", encoding="UTF-8")
        path = generate_path_string(element, title=title)
        described.append({'id': target, 'text': text, 'path': path})

    resolvable = [entry for entry in described if id_lookup.get(entry['id'])]

    section = []
    for entry in resolvable:
        section.append((document_id, entry['id'], entry['path'][0],
                        entry['path'][1], entry['text']))
    return section
Exemplo n.º 10
0
def id_lookup(db):
    """Empty ``id_lookup`` and rebuild it from every instrument document.

    Streams ``(id, title, document)`` rows through the named cursor
    ``law_cursor`` one at a time, parses each document and records
    ``(element id, document id, path string)`` once per distinct
    ``(element id, document id)`` pair. Rows are written in multi-row INSERTs,
    flushed whenever more than 100000 are pending and once more at the end,
    then the transaction is committed.

    Processing is best-effort per document: if parsing or path generation
    raises, the rest of that document is skipped (rows already collected from
    it are kept) and a message is printed.

    Args:
        db: connection object providing ``cursor(...)`` and ``commit()``.
    """
    upsert = ""  #" ON CONFLICT (id_lookup_uniq) DO NOTHING"
    insert_prefix = "INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES "

    with db.cursor(cursor_factory=extras.RealDictCursor) as cur:
        cur.execute(""" delete from id_lookup""")
    with db.cursor(cursor_factory=extras.RealDictCursor,
                   name="law_cursor") as cur, db.cursor() as out:

        def flush(rows):
            # Write all pending rows with a single INSERT, then empty the list
            # in place. mogrify() binds each tuple into its "(%s,%s,%s)" group.
            args_str = ','.join(
                cur.mogrify("(%s,%s,%s)", x) for x in rows)
            out.execute(insert_prefix + args_str + upsert)
            rows[:] = []

        cur.execute(
            """SELECT d.id, title, document FROM instruments i join documents d on i.id = d.id """
        )
        count = 0
        id_results = []
        completed = set()  # (element id, document id) pairs already queued
        results = cur.fetchmany(1)
        while results:
            for result in results:
                if count % 10 == 0:
                    # Progress: documents processed so far and rows pending.
                    # Formatted into one string so the output is identical
                    # whether print is a statement or a function.
                    print('%d %d' % (count, len(id_results)))
                count += 1
                try:
                    for el in etree.fromstring(result['document'],
                                               parser=p).findall('.//*[@id]'):
                        new_id = el.attrib.get('id')
                        key = (new_id, result['id'])
                        if key in completed:
                            continue
                        entry = (
                            new_id, result['id'],
                            generate_path_string(
                                el,
                                title=unicode(
                                    result['title'].decode('utf-8')))[0])
                        id_results.append(entry)
                        completed.add(key)
                except Exception as exc:
                    # Still best-effort, but no longer silent, and no longer
                    # trapping KeyboardInterrupt/SystemExit as a bare
                    # ``except:`` did.
                    print('id_lookup: skipping document %r: %r'
                          % (result['id'], exc))
                    continue
            results = cur.fetchmany(1)
            if len(id_results) > 100000:
                flush(id_results)

        # An INSERT with no value tuples would be invalid SQL, so only flush
        # the remainder when there is one.
        if id_results:
            flush(id_results)
    db.commit()
Exemplo n.º 11
0
def add_new_ids(tree, document_id, title, db=None):
    """ add the new ids to the id look up

    For every element of ``tree`` that has an ``id`` attribute, inserts
    ``(id, document_id, path string)`` into ``id_lookup`` using one multi-row
    INSERT, then commits. When the tree has no such elements the INSERT is
    skipped (a ``VALUES`` clause with no tuples is not valid SQL); the commit
    still happens.

    Args:
        tree: parsed document exposing ``xpath``.
        document_id: stored as ``parent_id`` of every inserted row.
        title: forwarded to ``generate_path_string``.
        db: connection to use; defaults to ``get_db()`` when falsy.
    """
    db = db or get_db()
    with db.cursor() as cur:
        id_results = []
        for el in tree.xpath('//*[@id]'):
            new_id = el.attrib.get('id')
            path_string = generate_path_string(el, title=title)[0]
            id_results.append((new_id, document_id, path_string))
        if id_results:
            # mogrify() binds each tuple into its own "(%s,%s,%s)" group.
            args_str = ','.join(
                cur.mogrify("(%s,%s,%s)", x) for x in id_results)
            cur.execute(
                "INSERT INTO id_lookup(govt_id, parent_id, repr) VALUES " +
                args_str)
    db.commit()
Exemplo n.º 12
0
def instrument_govt_location(instrument, id, link_text, args):
    """Return the fragment response for a govt id inside ``instrument``.

    ``decide_govt_or_path`` selects the nodes from the instrument's tree using
    ``id`` and ``link_text``. The first node supplies the full title; the nodes
    are then passed through ``cull_tree`` and rendered to HTML. The ``query``
    entry echoes ``id`` back as ``govt_location``.

    ``args`` is accepted but not used.
    """
    nodes = decide_govt_or_path(instrument.get_tree(), id, link_text)
    # Only the first item of the path triple ends up in the response.
    full_location, _, location = generate_path_string(nodes[0])
    fragment = cull_tree(nodes)
    html = etree.tostring(tohtml(fragment), encoding='UTF-8', method="html")

    response = {
        'html_content': html,
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(instrument.attributes['date_as_at']),
        'format': 'fragment',
    }
    response['query'] = {
        'doc_type': 'instrument',
        'document_id': instrument.id,
        'find': 'govt_location',
        'govt_location': id,
    }
    return response
Exemplo n.º 13
0
def find_all_definitions(tree, definitions, document_id, expire=True, title=None):
    """Collect the defined terms of ``tree`` into ``definitions``.

    Every ``def-term`` element that has no ``skeletons``, ``history``,
    ``table``, ``amend`` or ``schedule.amendments`` ancestor is examined. A
    term is skipped when:

    * it has no text, or its first text chunk is at most one character long
      once brackets and parentheses are removed;
    * it sits inside parentheses (preceding text ends with ``(``, ``(the `` or
      ``(a `` and the following text starts with ``)``);
    * it is inside a ``label-para`` that is itself inside a ``def-para``;
    * it has neither a ``def-para`` nor a ``para`` ancestor.

    For each remaining term a ``Definition`` is added to ``definitions``.

    Side effects on the tree: the term's parent (and, when present, the
    ``text`` child of the enclosing ``para``) gets a ``temp-def-id`` attribute
    and the term itself gets an ``id`` attribute if it had none.

    Args:
        tree: parsed document exposing ``xpath``.
        definitions: collection receiving the results via ``add``.
        document_id: integer id of the document (formatted with ``%d``).
        expire: when true, lifetime information comes from
            ``infer_life_time(parent)``; otherwise ``['root'], 100, False``.
        title: forwarded to ``generate_path_string``.
    """
    nodes = tree.xpath(".//def-term[not(ancestor::skeletons)][not(ancestor::history)][not(ancestor::table)][not(ancestor::amend)][not(ancestor::schedule.amendments)]")
    # Raw string: the pattern contains regex escapes (``\(``) that are not
    # valid string escapes.
    parenthesis_pat = re.compile(r'.*(\(|\(the |\(a )$', flags=re.DOTALL)

    def get_parent(node):
        # Nearest def-para ancestor, else nearest para ancestor. StopIteration
        # escapes when neither exists; the loop below then skips the term.
        try:
            return next(node.iterancestors('def-para'))
        except StopIteration:
            return next(node.iterancestors('para'))

    count = 0
    for node in nodes:
        try:
            # hack: strip brackets/parentheses so that placeholders like 'A'
            # are rejected by the length check below
            text = re.sub('[][()]', '', next(node.itertext()))

            # now if the preceding text is a bracket, ignore this
            try:
                if (parenthesis_pat.match(node.xpath('preceding::text()[1]')[0]) and
                        node.xpath('following::text()[1]')[0][0] == ')'):
                    continue
            except IndexError:
                # no preceding/following text (or it is empty): not bracketed
                pass
            if len(text) > 1:
                # another hack:  if you are in a  label-para which is in a def-para, you aren't the primary definition
                try:
                    next(next(node.iterancestors('label-para')).iterancestors('def-para'))
                except StopIteration:
                    pass
                else:
                    continue
                parent = get_parent(node)
                # used to flag for find later, when we insert def into db (must survive serializing)
                temp_id = parent.attrib.get('temp-def-id', str(uuid.uuid4()))
                parent.attrib['temp-def-id'] = temp_id

                src = etree.Element('catalex-src')
                node.attrib['id'] = node.attrib.get('id', str(uuid.uuid4()))
                src.attrib['src'] = node.attrib.get('id')
                src.attrib['target-id'] = '%d' % document_id
                src.attrib['link-id'] = '%d-%d' % (document_id, count)
                src.text, src.attrib['href'], location = generate_path_string(node, title=title)
                src.attrib['location'] = location
                src_id = src.attrib['src']
                # from here on ``src`` is the serialized markup, not the element
                src = etree.tostring(src, method="html", encoding="UTF-8")
                if expire:
                    expiry_tags, priority, exclusive = infer_life_time(parent)
                else:
                    expiry_tags, priority, exclusive = ['root'], 100, False
                try:
                    context_parent = next(parent.iterancestors('para'))
                    context = context_parent.xpath('./text')[0]
                    context_id = context.attrib.get('temp-def-id', str(uuid.uuid4()))
                    context.attrib['temp-def-id'] = context_id
                    result = {'context_id': context_id, 'temp_id': temp_id, 'src': src}
                except (StopIteration, IndexError):
                    # no enclosing para, or it has no text child: no context
                    result = {'temp_id': temp_id, 'src': src}
                definitions.add(Definition(full_word=text, results=[result],
                                id='%d-%s' % (document_id, src_id),
                                document_id=document_id, expiry_tags=expiry_tags,
                                priority=priority, exclusive=exclusive))
                # only counts terms that produced a definition; feeds link-id
                count += 1

        except StopIteration:
            # the term has no text at all, or no def-para/para ancestor
            pass