def create_table(name, elements):
    """Create the SQL table for `name`, plus one relation table per repeated
    sub-element, and a last-modified trigger.

    Relies on module globals: TYPE_CONVERSIONS, db_type, cursor, connection,
    complex_types, simple_types, schema, trigger_count.
    `elements` is the schema element list for this type/element.
    """
    # might be more efficient/pythonic to make a mapping, use names/type added in sync
    global trigger_count

    def _relation_table_sql(parent, elem):
        # One relation table links a parent row to one repeated child value;
        # simpleContent attributes (if any) become extra columns.
        sql = "CREATE TABLE " + str(parent) + "_" + elem["name"][:elem["name"].find("_id")]
        sql += "(vip_id " + TYPE_CONVERSIONS[db_type]["xs:integer"]
        sql += ",election_id " + TYPE_CONVERSIONS[db_type]["xs:integer"]
        sql += ",feed_id " + TYPE_CONVERSIONS[db_type]["xs:integer"]
        sql += "," + str(parent) + "_id " + TYPE_CONVERSIONS[db_type]["xs:integer"]
        sql += "," + elem["name"] + " " + TYPE_CONVERSIONS[db_type]["xs:integer"]
        if "simpleContent" in elem:
            for attr in elem["simpleContent"]["attributes"]:
                sql += "," + attr["name"] + " " + TYPE_CONVERSIONS[db_type][attr["type"]]
        sql += ", is_used " + TYPE_CONVERSIONS[db_type]["boolean"]
        sql += ")"
        return sql

    create_statement = "CREATE TABLE " + str(name)
    create_statement += " (id " + TYPE_CONVERSIONS[db_type]["id"]
    if name not in complex_types:
        # NOTE(review): the nesting of the vip_id/election_id/feed_id columns
        # was reconstructed from a whitespace-mangled original — confirm
        # against version control.
        if name != "source":
            create_statement += ", vip_id " + TYPE_CONVERSIONS[db_type]["int"]
            if name != "contest":
                create_statement += ", election_id " + TYPE_CONVERSIONS[db_type]["int"]
        create_statement += ", feed_id " + TYPE_CONVERSIONS[db_type]["xs:integer"]
    create_statement += ", is_used " + TYPE_CONVERSIONS[db_type]["boolean"]

    for e in elements:
        if "name" not in e:
            # Anonymous wrapper: look one level down for simpleContent children.
            if "elements" in e:
                for sub_e in e["elements"]:
                    if "simpleContent" in sub_e:
                        cursor.execute(_relation_table_sql(name, sub_e))
                        connection.commit()
        elif e["type"] == "complexType":
            if "simpleContent" in e:
                cursor.execute(_relation_table_sql(name, e))
                connection.commit()
        elif e["type"].startswith("xs:"):
            if "maxOccurs" in e and e["maxOccurs"] == "unbounded":
                # Repeated scalar value -> gets its own relation table.
                cursor.execute(_relation_table_sql(name, e))
                connection.commit()
            else:
                create_statement += ", " + str(e["name"])
                create_statement += " " + TYPE_CONVERSIONS[db_type][e["type"]]
        else:
            if e["type"] in simple_types:
                create_statement += ", " + str(e["name"])
                if db_type == "sqlite3":
                    create_statement += " TEXT"
                elif db_type == "mysql":
                    # Enumerations become a MySQL ENUM of the deduped,
                    # lower-cased allowed values.
                    simple_elements = list(set(elem.lower() for elem in schema.get_element_list("simpleType", e["type"])))
                    create_statement += " ENUM('"
                    create_statement += "','".join(simple_elements)
                    create_statement += "')"
                elif db_type == "postgres":
                    # Postgres enum types are created separately (see create_enum).
                    create_statement += " " + e["type"]
            elif e["type"] in complex_types:
                # Complex children live in their own table; keep a FK id here.
                create_statement += ", " + str(e["name"]) + "_id "
                create_statement += TYPE_CONVERSIONS[db_type]["xs:integer"]

    create_statement += ", last_modified " + TYPE_CONVERSIONS[db_type]["timestamp"]
    if db_type == "sqlite3":
        create_statement += " DEFAULT CURRENT_TIMESTAMP NOT NULL"
    elif db_type == "mysql":
        create_statement += " default now() on update now() "
    elif db_type == "postgres":
        create_statement += " DEFAULT CURRENT_TIMESTAMP "
    create_statement += ", date_created " + TYPE_CONVERSIONS[db_type]["timestamp"]
    if db_type == "sqlite3":
        create_statement += " DEFAULT CURRENT_TIMESTAMP NOT NULL"
    if db_type == "mysql":
        create_statement += " default '0000-00-00 00:00:00' "
    elif db_type == "postgres":
        create_statement += " DEFAULT CURRENT_TIMESTAMP "
    create_statement += ");"
    cursor.execute(create_statement)
    connection.commit()

    if db_type == "postgres":
        # BUGFIX: the function previously set NEW.lastmodified, but the column
        # created above is named last_modified.
        # TODO(review): no CREATE TRIGGER is issued to attach this function to
        # the table — confirm whether that happens elsewhere.
        create_trigger = "CREATE OR REPLACE FUNCTION update_last_modified() RETURNS TRIGGER AS $$ BEGIN NEW.last_modified = NOW(); RETURN NEW; END; $$ LANGUAGE 'plpgsql'"
        cursor.execute(create_trigger)
        connection.commit()
    elif db_type == "sqlite3":
        # BUGFIX: the UPDATE previously matched "WHERE id = new.<table name>",
        # which is not a column; the freshly inserted row is new.id.
        create_trigger = ("CREATE TRIGGER update_last_modified" + str(trigger_count) +
                          " AFTER INSERT ON " + str(name) +
                          " BEGIN UPDATE " + str(name) +
                          " SET last_modified = datetime('now') WHERE id = new.id; END;")
        cursor.execute(create_trigger)
        connection.commit()
        trigger_count += 1
# Connect to the target database (sqlite3 / mysql / postgres), load the VIP
# schema over HTTP, and create one table per complex type and per top-level
# element.  Assumes db_type, host, db_name, username, password, parameters,
# SCHEMA_URL and the DB driver modules are defined above this point.
if parameters.username:
    username = parameters.username
if parameters.password:
    password = parameters.password
if db_type == "sqlite3":
    # For sqlite3 "host" is the database file path.
    connection = sqlite3.connect(host)
elif db_type == "mysql":
    connection = mdb.connect(host, username, password, db_name)
elif db_type == "postgres":
    connection = psycopg2.connect(host=host, database=db_name, user=username, password=password)
cursor = connection.cursor()
fschema = urllib.urlopen(SCHEMA_URL)
# NOTE(review): rebinds the name "schema" from the imported module to a
# Schema instance, shadowing the module from here on.
schema = schema.Schema(fschema)
complex_types = schema.get_complexTypes()
simple_types = schema.get_simpleTypes()
elements = schema.get_element_list("element", "vip_object")
if db_type == "postgres":
    # Postgres needs the ENUM types created before the tables that use them.
    for simple in simple_types:
        create_enum(simple, schema.get_element_list("simpleType", simple))
for complex_t in complex_types:
    create_table(complex_t, schema.get_sub_schema(complex_t)["elements"])
for element in elements:
    create_table(element, schema.get_sub_schema(element)["elements"])
#one option: multiple exports for flat files for the elements with multiple sub elements of the same type, fix was pulled #into or out of the database into the feed from lxml import etree import schema import urllib #fschema = urllib.urlopen("http://election-info-standard.googlecode.com/files/vip_spec_v2.3.xsd") fschema = open("schema.xsd") schema = schema.schema(fschema) simpleAddressTypes = schema.get_elements_of_attribute("type", "simpleAddressType") detailAddressTypes = schema.get_elements_of_attribute("type", "detailAddressType") ELEMENT_LIST = schema.get_element_list("element","vip_object") fname = 'test_feed.xml' xmlparser = etree.XMLParser() data = etree.parse(open(fname), xmlparser) root = data.getroot() elements = root.getchildren() element_name = "" sub_element_list = [] write_list = [] w = None for element in elements: if element.tag in ELEMENT_LIST:
import schema
import urllib

# This uses individual inserts for each new row into the database, which is
# done due to max insert size problems we could run into. Also, I think this
# will free up the database for other tasks when doing a large upload
fschema = urllib.urlopen("http://election-info-standard.googlecode.com/files/vip_spec_v3.0.xsd")
#fschema = open("schema.xsd")
# NOTE(review): rebinds "schema" from the module to a parsed schema instance.
schema = schema.schema(fschema)
simpleAddressTypes = schema.get_elements_of_attribute("type", "simpleAddressType")
detailAddressTypes = schema.get_elements_of_attribute("type", "detailAddressType")
ELEMENT_LIST = schema.get_element_list("element", "vip_object")

# Map each simpleContent element name to the parent element names containing it.
# BUGFIX: was "SIMPLECONTENTS = {)" — a syntax error; should be an empty dict.
SIMPLECONTENTS = {}
for elem in schema.schema["element"][0]["elements"]:
    for e in elem["elements"]:
        if "simpleContent" in e:
            if e["name"] in SIMPLECONTENTS:
                SIMPLECONTENTS[e["name"]]["parents"].append(elem['name'])
            else:
                SIMPLECONTENTS[e["name"]] = {"parents": [elem['name']]}

# Same mapping for unbounded (maxOccurs="unbounded") non-simpleContent elements.
UNBOUNDEDS = {}
for elem in schema.schema["element"][0]["elements"]:
    for e in elem["elements"]:
        if "maxOccurs" in e and "simpleContent" not in e and e["maxOccurs"] == "unbounded":
            if e["name"] in UNBOUNDEDS:
                UNBOUNDEDS[e["name"]]["parents"].append(elem["name"])
            # NOTE(review): the "else" branch that first creates the UNBOUNDEDS
            # entry (mirroring SIMPLECONTENTS above) is truncated in this chunk
            # — restore it from the full file.
import schema
import urllib
import os
import csv

#TODO: accommodate for sort_order as part of the relational tables
fschema = urllib.urlopen("http://election-info-standard.googlecode.com/files/vip_spec_v3.0.xsd")
#fschema = open("schema.xsd")
# NOTE(review): rebinds "schema" from the module to a parsed schema instance.
schema = schema.schema(fschema)
simpleAddressTypes = schema.get_elements_of_attribute("type", "simpleAddressType")
detailAddressTypes = schema.get_elements_of_attribute("type", "detailAddressType")
ELEMENT_LIST = schema.get_element_list("element", "vip_object")

# Map each simpleContent element name to the parent element names containing it.
# BUGFIX: was "SIMPLECONTENTS = {)" — a syntax error; should be an empty dict.
SIMPLECONTENTS = {}
for elem in schema.schema["element"][0]["elements"]:
    for e in elem["elements"]:
        if "simpleContent" in e:
            if e["name"] in SIMPLECONTENTS:
                SIMPLECONTENTS[e["name"]]["parents"].append(elem['name'])
            else:
                SIMPLECONTENTS[e["name"]] = {"parents": [elem['name']]}

# Same mapping for unbounded (maxOccurs="unbounded") non-simpleContent elements.
UNBOUNDEDS = {}
for elem in schema.schema["element"][0]["elements"]:
    for e in elem["elements"]:
        if "maxOccurs" in e and "simpleContent" not in e and e["maxOccurs"] == "unbounded":
            if e["name"] in UNBOUNDEDS:
                UNBOUNDEDS[e["name"]]["parents"].append(elem["name"])
            # NOTE(review): the "else" branch that first creates the UNBOUNDEDS
            # entry (mirroring SIMPLECONTENTS above) is truncated in this chunk
            # — restore it from the full file.