示例#1
0
def createSchema(result, table_name):
    '''Create a database table whose columns mirror the fields of an idigbio query.

       Takes an idigbio query in dict format as argument "result" and creates
       a new, initially empty table named "table_name". Every distinct field
       name (dictionary key) found in the "indexTerms" of the records in
       result["items"] is then added to the table as a column.

       Column types:
         - "datecollected" and "datemodified" become DATE
         - the other fields listed in special_fields become TEXT
         - remaining fields are typed from the idigbio field metadata
           endpoint: string -> VARCHAR(200), float -> DECIMAL,
           integer -> INTEGER, boolean -> BOOLEAN, and anything else
           (including fields missing from the metadata) -> VARCHAR(200)

       Exits the process with sys.exit(0) if the table already exists and
       with sys.exit(1) if the CREATE TABLE command is rejected.

       NOTE: table_name is placed into the SQL commands as-is (unquoted), so
       it must come from a trusted source.
    '''
    #Check that table does not already exist in database
    if tableExists(table_name):
        print("Table '" + table_name + "' already exists.")
        sys.exit(0)

    #Extract record type data from idigbio API endpoint. This is done before
    #the table is created so that a failed download does not leave an empty
    #table behind (which would make every re-run stop at the check above).
    #The "with" block makes sure the HTTP response is closed.
    with urllib.request.urlopen("http://search.idigbio.org/v2/meta/fields/records") as raw_data:
        #Decode data and convert to JSON/ Python dictionary
        record_types_dict = json.loads(raw_data.read().decode())

    #Fields stored as dates
    date_fields = ("datecollected", "datemodified")

    #Fields that require special treatment (longer than 200 chars, special type etc.)
    special_fields = ["typestatus", "data", "datecollected", "datemodified",
                      "indexData", "flags", "recordids", "locality",
                      "verbatimlocality", "collector", "commonnames",
                      "mediarecords", "highertaxon"]

    #PSQL column type for each simple API field type. Any other type
    #(geopoint etc.) falls back to VARCHAR(200)
    column_types = {"string": "VARCHAR(200)", "float": "DECIMAL",
                    "integer": "INTEGER", "boolean": "BOOLEAN"}

    #Connect to database, DB info can be set in DBInfo.py file
    connection = connectDB()

    #try/finally so the connection is released on every exit path,
    #including sys.exit() and unexpected database errors
    try:
        #Initialize cursor, attribute of psycopg2 "connection" object
        cursor = connection.cursor()

        #Create a new table in the database
        try:
            cursor.execute("CREATE TABLE " + table_name + "()")
            connection.commit()
        except psycopg2.ProgrammingError as e:
            print("Table not created successfully.")
            print(e.pgerror)
            #Non-zero status so that calling scripts can detect the failure
            sys.exit(1)

        #The table was just created without columns, so the columns it
        #contains are exactly the ones added below. Tracking them here avoids
        #querying the database once per record.
        table_fields = set()

        #Iterate through each record in idigbio query result dictionary
        for record in result["items"]:
            #Iterate through fields in current record
            for field in record["indexTerms"]:
                #Field already has a column
                if field in table_fields:
                    continue

                #TODO: Change "data" & "indexData" fields (JSON data) to
                #jsonb type in database. Stored as TEXT for now.
                if field in date_fields:
                    column_type = "DATE"
                elif field in special_fields:
                    #Treated as strings of unknown length
                    column_type = "TEXT"
                else:
                    #Extract field's designated type from API data. Fields the
                    #API does not describe get the default string column
                    #instead of raising a KeyError.
                    field_meta = record_types_dict.get(field)
                    if isinstance(field_meta, dict):
                        field_type = field_meta.get("type")
                    else:
                        field_type = None
                    column_type = column_types.get(field_type, "VARCHAR(200)")

                #Double quotes inside a quoted identifier must be doubled
                quoted_field = "\"" + field.replace("\"", "\"\"") + "\""

                #Execute column addition command
                cursor.execute("ALTER TABLE " + table_name + " ADD COLUMN " +
                               quoted_field + " " + column_type)
                table_fields.add(field)

        #Save changes to DB
        connection.commit()
        cursor.close()
    finally:
        #Close connection to DB
        connection.close()

    print("Database table " + table_name + " has been created successfully.")
示例#2
0
def populateTable(result, table_name):
    '''Insert the records of an idigbio query into a database table.

    Takes an idigbio query in dictionary format as argument "result" and
    inserts each record of result["items"] into the table "table_name". For
    every record the keys of its "indexTerms" are used as column names and
    the values as column values:
      - "indexData" and "geopoint" values are stored as JSON strings
      - "datecollected" and "datemodified" are cut to their first 10
        characters (date only, no time & timezone)
      - None is stored as NULL
      - every other value is stored in its str() form

    Values are passed to the database as query parameters, so they need no
    manual escaping. Each record is committed separately; a record that the
    database rejects with a DataError is rolled back and omitted, and the
    remaining records are still processed. Records without any fields are
    skipped.

    NOTE: table_name is placed into the SQL command as-is (unquoted), so it
    must come from a trusted source.
    '''
    #Connect to database using DBInfo.py script (Database defined in this script)
    connection = connectDB()

    #try/finally so the connection is released even if an unexpected
    #error interrupts the loop
    try:
        #Initialize cursor (psycopg2 connection object attribute)
        cursor = connection.cursor()

        #Iterate through records in query
        for record in result["items"]:
            index_terms = record["indexTerms"]

            #A record without fields cannot form a valid insert command
            if not index_terms:
                continue

            #Quoted column names and their values, in matching order
            columns = []
            values = []

            for key, raw_value in index_terms.items():
                if raw_value is None:
                    #Stored as NULL rather than the string 'None'
                    value = None
                elif key == "indexData" or key == "geopoint":
                    #Dictionaries stored as json string
                    value = json.dumps(raw_value)
                else:
                    value = str(raw_value)

                #Standardize dates (eliminate timezone & time)
                if value is not None and key in ("datecollected", "datemodified"):
                    value = value[:10]

                #Double quotes inside a quoted identifier must be doubled
                columns.append("\"" + key.replace("\"", "\"\"") + "\"")
                values.append(value)

            #Command text up to the value list. A literal % has to be doubled
            #because the command is executed with query parameters.
            insert_base = ("INSERT INTO " + table_name + " (" +
                           ", ".join(columns) + ")").replace("%", "%%")

            #One placeholder per value; the driver does the quoting
            placeholders = ", ".join(["%s"] * len(values))
            insert_command = insert_base + " VALUES (" + placeholders + ")"

            #Attempt to execute command in database
            try:
                cursor.execute(insert_command, values)
            #If command fails, rollback the query and omit given record
            except connection.DataError:
                connection.rollback()
                print("There was an error inputting data of record " +
                      str(index_terms.get("uuid", "unknown")))
                print(
                    "Reason: Field value length out of range. Ommitting record.\n")
                continue

            #Commit record to database
            connection.commit()

        #At this point all records have been processed and inserted into DB
        print("Database table " + table_name + " has been populated successfully.")

        cursor.close()
    finally:
        #Operations with DB complete, close connection to server
        connection.close()
示例#3
0
import urllib.request
import json
import psycopg2
import idigbio
import sys
from DBInfo import connectDB, tableExists

'''Python script for building a table schema for a PostgreSQL database based
on a query result from idigbio. Database details defined in DBInfo.py script's
connectDB() function.
'''


def createSchema(result, table_name):
    '''Function that takes an idigbio query in dict format as argument "result" 
       and creates a database table schema based on based on the fields
       (dictionary keys) present in the query. 
       Goes through field names in each record in the query and adds distinct ones to table
    '''
    #Check that table does not already exist in database
    if tableExists(table_name):
        print("Table '" + table_name + "' already exists.")
        sys.exit(0)
    
    #Connect to database, DB info can be set in DBInfo.py file
    connection = connectDB()

    #Initialize cursor, attribute of psycopg2 "connection" object
    cursor = connection.cursor()
    
    create_command = "CREATE TABLE " + table_name + "()"