def createSchema(result, table_name):
    """Create a PostgreSQL table whose columns match the fields of an
    iDigBio query result.

    Iterates over every record in ``result["items"]`` and adds one column
    per distinct field found in the record's ``indexTerms``.  Column types
    come from the iDigBio records-field metadata endpoint, except for a
    handful of special-case fields (JSON payloads, dates, long strings).

    Args:
        result: iDigBio query result as a dict; must contain an "items"
            list of records, each with an "indexTerms" dict.
        table_name: Name of the table to create.  NOTE(review): the name
            is interpolated directly into SQL -- pass trusted values only.

    Exits the process if the table already exists (status 0) or the
    CREATE TABLE fails (status 1).
    """
    # Abort early if the table is already present in the database.
    if tableExists(table_name):
        print("Table '" + table_name + "' already exists.")
        sys.exit(0)

    # Connect to database; DB info can be set in the DBInfo.py file.
    connection = connectDB()
    cursor = connection.cursor()

    # Create an empty table; columns are added one by one below.
    try:
        cursor.execute("CREATE TABLE " + table_name + "()")
        connection.commit()
    except psycopg2.ProgrammingError as e:
        print("Table not created successfully.")
        print(e.pgerror)
        sys.exit(1)

    # Fetch field-type metadata from the iDigBio API.  The context
    # manager guarantees the HTTP response is closed (the original
    # leaked it).
    with urllib.request.urlopen(
            "http://search.idigbio.org/v2/meta/fields/records") as raw_data:
        record_types_dict = json.loads(raw_data.read().decode())

    # Fields that require special treatment (longer than 200 chars,
    # JSON payloads, dates, etc.).
    special_fields = {
        "typestatus", "data", "datecollected", "datemodified", "indexData",
        "flags", "recordids", "locality", "verbatimlocality", "collector",
        "commonnames", "mediarecords", "highertaxon",
    }
    # Explicit SQL types for special cases; any other special field
    # defaults to TEXT.
    # TODO: change the JSON-bearing fields (indexData, data) to jsonb.
    special_types = {
        "indexData": "TEXT",       # JSON data structure, stored as text
        "data": "TEXT",            # JSON data structure, stored as text
        "datecollected": "DATE",
        "datemodified": "DATE",
    }
    # Mapping from the API's declared types to PostgreSQL column types;
    # unknown types (e.g. geopoint) fall back to VARCHAR(200).
    api_types = {
        "string": "VARCHAR(200)",
        "float": "DECIMAL",
        "integer": "INTEGER",
        "boolean": "BOOLEAN",
    }

    # Read the existing column list once and maintain it locally --
    # avoids re-querying the table for every record.
    cursor.execute("SELECT * FROM " + table_name + " LIMIT 0")
    table_fields = {desc[0] for desc in cursor.description}

    # Add a column for every field not yet present in the table.
    for record in result["items"]:
        for field in record["indexTerms"]:
            if field in table_fields:
                continue
            if field in special_fields:
                col_type = special_types.get(field, "TEXT")
            else:
                col_type = api_types.get(
                    record_types_dict[field]["type"], "VARCHAR(200)")
            cursor.execute('ALTER TABLE ' + table_name +
                           ' ADD COLUMN "' + field + '" ' + col_type)
            table_fields.add(field)

    # Persist all column additions and close the connection.
    connection.commit()
    cursor.close()
    connection.close()
    print("Database table " + table_name + " has been created successfully.")
def populateTable(result, table_name):
    """Insert every record of an iDigBio query result into a database table.

    For each record in ``result["items"]``, builds a parameterized INSERT
    command from the record's ``indexTerms`` keys/values and executes it.
    Records whose values violate column constraints are rolled back and
    skipped; all other records are committed individually.

    Args:
        result: iDigBio query result as a dict; must contain an "items"
            list of records, each with an "indexTerms" dict.
        table_name: Name of the destination table.  NOTE(review): the name
            is interpolated directly into SQL -- pass trusted values only.
    """
    # Connect to database using the DBInfo.py script.
    connection = connectDB()
    cursor = connection.cursor()

    for record in result["items"]:
        columns = []
        values = []
        for key, raw_value in record["indexTerms"].items():
            # indexData and geopoint are dicts; store them as JSON strings.
            if key in ("indexData", "geopoint"):
                value = json.dumps(raw_value)
            else:
                value = str(raw_value)
            # Standardize dates: keep YYYY-MM-DD, drop time and timezone.
            if key in ("datecollected", "datemodified"):
                value = value[:10]
            columns.append('"' + key + '"')
            values.append(value)

        # A record with no fields would yield invalid SQL; skip it.
        if not columns:
            continue

        # Values are bound as query parameters, so no manual quote
        # escaping is needed and the data cannot inject SQL (the
        # original concatenated values into the command string).
        insert_command = ("INSERT INTO " + table_name + " (" +
                          ", ".join(columns) + ") VALUES (" +
                          ", ".join(["%s"] * len(values)) + ")")
        try:
            cursor.execute(insert_command, values)
        except connection.DataError:
            # Field value did not fit its column: roll back and omit
            # this record, then move on to the next one.
            connection.rollback()
            print("There was an error inputting data of record " +
                  record["indexTerms"]["uuid"])
            print("Reason: Field value length out of range. "
                  "Omitting record.\n")
            continue
        # Commit this record before processing the next.
        connection.commit()

    # All records processed; close the connection to the server.
    print("Database table " + table_name + " has been populated successfully.")
    cursor.close()
    connection.close()
import urllib.request import json import psycopg2 import idigbio import sys from DBInfo import connectDB, tableExists '''Python script for building a table schema for a PostgreSQL database based on a query result from idigbio. Database details defined in DBInfo.py script's connectDB() function. ''' def createSchema(result, table_name): '''Function that takes an idigbio query in dict format as argument "result" and creates a database table schema based on based on the fields (dictionary keys) present in the query. Goes through field names in each record in the query and adds distinct ones to table ''' #Check that table does not already exist in database if tableExists(table_name): print("Table '" + table_name + "' already exists.") sys.exit(0) #Connect to database, DB info can be set in DBInfo.py file connection = connectDB() #Initialize cursor, attribute of psycopg2 "connection" object cursor = connection.cursor() create_command = "CREATE TABLE " + table_name + "()"