def define_index(self, index_name, columns, kind="INDEX"):
    """
    Define or replace an index definition.

    :param index_name: Index name, must be unique within a table.
    :param columns: Valid list of columns.
    :param kind: One of the valid index types.
    :return: None
    :raises DataTableException: if a column is not defined on the table
        or the index name is already in use.
    """
    # Every indexed column must already be part of the table definition.
    for col in columns:
        if col not in self.column_nameset:
            raise DataTableExceptions.DataTableException(
                code=DataTableExceptions.DataTableException.invalid_index,
                message="Index column " + col + " is invalid.")
    if index_name in self.index_nameset:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.duplicate_index,
            message="Index " + index_name + " is duplicate.")

    self.index_definitions.append(
        IndexDefinition(index_name, kind, columns))
    self.index_nameset.add(index_name)

    # Persist one catalog row per column; position is 1-based.
    # Parameterized query replaces string-built SQL (injection-prone).
    cursor = self.cnx.cursor()
    for i, col in enumerate(columns):
        cursor.execute(
            "INSERT INTO indexes VALUES (%s, %s, %s, %s, %s);",
            (self.t_name, index_name, col, kind, i + 1))
    self.cnx.commit()
def drop_column_definition(self, c):
    """
    Remove a column from the in-memory definition and the catalog tables.

    :param c: Column name (string).
    :return: None
    :raises DataTableException: if the column is not defined.
    """
    exist = None
    for exist_column in self.column_definitions:
        if exist_column.column_name == c:
            exist = exist_column
            break
    if not exist:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.invalid_column,
            message="Column " + c + " is not in the column definition list.")
    if c not in self.column_nameset:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.
            invalid_column_definition,
            message="Column " + c + " is invalid.")

    # Remove column definition from the definition list, and keep the
    # name set in sync (the original left the name behind, so e.g.
    # define_index would still accept the dropped column).
    self.column_definitions.remove(exist)
    self.column_nameset.discard(c)

    cursor = self.cnx.cursor()
    cursor.execute(
        "delete from columns where table_name = %s and column_name = %s;",
        (self.t_name, c))
    self.cnx.commit()

    # Update index definitions, since dropping a column may CASCADE to
    # indexes that reference it. Iterate over a copy: the original
    # removed entries from self.index_definitions while iterating it,
    # which silently skips the element after each removal.
    for idx in list(self.index_definitions):
        if c in idx.columns:
            idx.columns.remove(c)
            if not idx.columns:
                self.index_definitions.remove(idx)
                self.index_nameset.remove(idx.index_name)
def add_column_definition(self, c):
    """
    Add a column definition.

    :param c: New column (ColumnDefinition). Cannot duplicate an existing
        definition and must name a column present in the CSV file.
    :return: None
    :raises DataTableException: on a duplicate or invalid column name.
    """
    for exist_column in self.column_definitions:
        if exist_column.column_name == c.column_name:
            raise DataTableExceptions.DataTableException(
                code=DataTableExceptions.DataTableException.
                duplicate_column,
                message="Column " + c.column_name + " is duplicate.")
    if c.column_name not in self.columns:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.
            invalid_column_definition,
            message="Column " + c.column_name + " is invalid.")

    # Add column definition to the in-memory definition list.
    self.column_definitions.append(c)

    # Parameterized insert replaces string-built SQL. not_null is passed
    # as a boolean, which the driver renders as 1/0 — equivalent to the
    # TRUE/FALSE literals the original emitted.
    cursor = self.cnx.cursor()
    cursor.execute(
        "INSERT INTO columns VALUES (%s, %s, %s, %s);",
        (self.t_name, c.column_name, c.column_type, bool(c.not_null)))
    self.cnx.commit()
    self.column_nameset.add(c.column_name)
def __find_by_template_scan__(self, t, fields=None, limit=None, offset=None):
    """
    Full-scan select: return the rows matching the template, projected
    onto the requested fields. Returns all rows if the template is None
    and all columns if fields is None.

    :param t: The template representing a select predicate.
    :param fields: The list of fields (project fields).
    :param limit: Max to return. Not implemented.
    :param offset: Offset into the result. Not implemented.
    :return: Rows resulting from the select and project, or None when
        the table holds no rows.
    """
    if limit is not None or offset is not None:
        raise DataTableExceptions.DataTableException(
            -101, "Limit/offset not supported for CSVTable")

    if self.__rows__ is None:
        return None

    # Keep only the rows satisfying the template, then project.
    matched = [row for row in self.__rows__ if self.matches_template(row, t)]
    return self.project(matched, fields)
def project(self, rows, fields):
    """
    Perform the relational PROJECT over a list of rows.

    :param rows: List of row dictionaries.
    :param fields: A list of column names, or None for all columns.
    :return: A new list of rows restricted to *fields*; *rows* itself
        when fields is None (callers rely on the shared object).
    :raises DataTableException: if a requested field is not in a row.
    """
    if fields is None:
        # No project clause: return the base rows.
        # (Should really return a copy, but callers depend on identity.)
        return rows
    try:
        # Dict comprehension replaces the original index loop with a
        # dangling for/else clause, which was easy to misread.
        return [{f: row[f] for f in fields} for row in rows]
    except KeyError as ke:
        # Happens when a requested field is absent from a row.
        raise DataTableExceptions.DataTableException(
            -2, "Invalid field in project") from ke
def __load__(self):
    """
    Load rows from the backing CSV file into the in-memory table,
    keeping only the columns named in the table's metadata.

    :raises DataTableException: if the file cannot be read.
    """
    try:
        fn = self.__get_file_name__()
        with open(fn, "r") as csvfile:
            # "," separates columns, and anything inside double quotes
            # parses as a single string even if it contains commas.
            reader = csv.DictReader(csvfile, delimiter=",", quotechar='"')
            # Column names defined for this table in the metadata.
            column_names = self.__get_columns_names__()
            for raw_row in reader:
                # The CSV file may contain columns that are not part of
                # the definition; project down to the defined ones.
                self.headers = list(raw_row.keys())
                self.__add_row__(self.project([raw_row], column_names)[0])
    except IOError as e:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.invalid_file,
            message="Could not read file = " + fn)
def __find_by_template_index__(self, t, idx, fields=None, limit=None, offset=None):
    """
    Find matching rows using a hash index.

    :param t: Template representing a where clause.
    :param idx: Name of index to use; its column names joined by "_".
    :param fields: Fields to return.
    :param limit: Not implemented. Ignore.
    :param offset: Not implemented. Ignore.
    :return: Matching tuples, or None when there are no rows / no match.
    """
    if limit is not None or offset is not None:
        raise DataTableExceptions.DataTableException(
            -101, "Limit/offset not supported for CSVTable")

    result = None
    if self.__rows__ is not None:
        # Build the hash key from the template's values for the index
        # columns, joined by "_" (the same scheme used to build the map).
        key = "_".join(t[col] for col in idx.split("_"))
        if key in self.hashmaps[idx]:
            result = self.hashmaps[idx][key]
        # Guard added: the original projected unconditionally (and kept
        # a leftover debug print), so a miss passed None into project()
        # and raised TypeError when fields was set.
        if result is not None:
            result = self.project(result, fields)
    return result
def create_table(self, table_name, file_name, column_definitions=None, primary_key_columns=None):
    """
    Record a new table and its columns in the catalog.

    :param table_name: Name of the new table; must not already exist.
    :param file_name: CSV file backing the table.
    :param column_definitions: Optional list of column definitions.
    :param primary_key_columns: Optional list of key column names.
    :return: The TableDefinition built for the new table.
    :raises DataTableException: if the table name is already cataloged.
    """
    # Determine copy table.
    a = TableDefinition(table_name, file_name, column_definitions,
                        primary_key_columns)
    previous_info = a.load_table_definition(self.cnx, table_name)
    if previous_info['definition'] != {}:
        if previous_info['definition']['name'] == table_name:
            raise DataTableExceptions.DataTableException(
                code=-101, message="Table name is duplicate")

    q = ('insert into definitions values ( "' + table_name + '", "'
         + file_name + '" )')
    if column_definitions:
        for cd in column_definitions:
            self.examine_column_name(table_name, cd.column_name)
            q1 = ('insert into columns values ( "' + table_name + '", "'
                  + cd.column_name + '", "' + cd.column_type + '", "'
                  + str(cd.not_null) + '" )')
            self._query(q1)
    self._query(q)
    return a
def examine_primary_key(self, columns):
    """
    Verify that every proposed key column is a defined column.

    :param columns: List of candidate key column names.
    :return: None
    :raises DataTableException: if any column is not defined.
    """
    defined = {cd['column_name'] for cd in self.column_definitions}
    for name in columns:
        if name not in defined:
            raise DataTableExceptions.DataTableException(
                code=-1000, message="Invalid key columns")
def __init__(self, index_name, index_type, columns):
    """
    :param index_name: Name for index. Must be a unique name for the table.
    :param index_type: Valid index type (case-insensitive; stored upper-cased).
    :param columns: List of column names covered by the index.
    :raises DataTableException: on a missing name or invalid type.
    """
    if not index_name:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.none_index_name,
            message="The index name cannot be None!")
    normalized = str.upper(index_type)
    if normalized not in self.index_types:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.invalid_type_name,
            message="The index type is invalid!")
    self.index_name = index_name
    self.index_type = normalized
    self.columns = columns
def __init__(self, column_name, column_type="text", not_null=False):
    """
    :param column_name: Cannot be None.
    :param column_type: Must be one of the valid column_types
        (case-insensitive; stored lower-cased).
    :param not_null: True or False.
    :raises DataTableException: on a missing name or invalid type.
    """
    if not column_name:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.none_column_name,
            message="The column name cannot be None!")
    normalized = str.lower(column_type)
    if normalized not in self.column_types:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.invalid_type_name,
            message="The column type is invalid!")
    self.column_name = column_name
    self.column_type = normalized
    self.not_null = not_null
def examine_column_name(self, table_name, column_name):
    """
    Validate a column name against the known header list for a table.

    :param table_name: One of the known tables ('people', 'batting', 'teams').
    :param column_name: Column name to validate.
    :return: None (unknown table names pass through unchecked, as before).
    :raises DataTableException: if the column is not in the table's header.
    """
    if table_name == 'people':
        header = people_header
    elif table_name == 'batting':
        header = batting_header
    elif table_name == 'teams':
        # The original checked batting_header here — a copy-paste bug
        # that validated teams columns against the wrong header list.
        # NOTE(review): assumes a module-level teams_header exists
        # alongside people_header/batting_header — confirm.
        header = teams_header
    else:
        return
    if column_name not in header:
        raise DataTableExceptions.DataTableException(
            code=-100,
            message="Column " + column_name + " definition is invalid ")
def add_column_definition(self, c):
    """
    Add a column definition.

    :param c: New column. Cannot be duplicate or a column not in the file.
    :return: None
    :raises DataTableException: on an invalid or duplicate column.
    """
    # The column must exist in the backing CSV file.
    if c.column_name not in self.valid_columns:
        raise de.DataTableException(
            code=-100,
            message="Column {} definition is invald".format(c.column_name))
    # The column must not already be defined on this table.
    if c.column_name in self.columns:
        raise de.DataTableException(
            message="Duplicate column {} for table {}".format(
                c.column_name, self.table_name))

    # MySQL catalog stores the flag as a "yes"/"no" string, not a
    # Python boolean.
    # NOTE(review): "yes" when not_null is True looks inverted if the
    # catalog column means "is nullable" — confirm the schema.
    flag = "yes" if c.not_null else "no"
    run_q(self.cnx,
          "insert into catalog_columns values (%s, %s, %s, %s)",
          [self.table_name, c.column_name, flag, c.column_type],
          False)
    self.column_definitions.append(c)
def run_q(cnx, q, args, fetch=False):
    """
    Execute a query on the given connection and commit.

    :param cnx: Open DB connection.
    :param q: Query string, with driver placeholders for parameters.
    :param args: Sequence of parameters for the placeholders.
    :param fetch: When True, return cursor.fetchall(); otherwise None.
    :return: Result rows when fetch is True, else None.
    :raises de.DataTableException: wrapping any execution error.
    """
    cursor = cnx.cursor()
    try:
        try:
            cursor.execute(q, args)
        except Exception as e:
            raise de.DataTableException(ex=e)
        result = cursor.fetchall() if fetch else None
    finally:
        # The original never closed the cursor, leaking it on both the
        # success and the error path.
        cursor.close()
    cnx.commit()
    return result
def define_primary_key(self, columns):
    """
    Define (or replace) the primary key definition.

    :param columns: List of key column names, in order.
    :return: None
    :raises DataTableException: if a column is invalid or a PRIMARY
        index already exists.
    """
    # A primary key is exactly an index named and typed "PRIMARY": the
    # original body duplicated define_index line-for-line with the
    # constants substituted. Delegating removes the duplication while
    # producing the same validation, error messages, and catalog rows.
    self.define_index("PRIMARY", columns, kind="PRIMARY")
def __find_by_template_index__(self, t, idx, fields=None, limit=None, offset=None):
    """
    Find using a selected index.

    :param t: Template representing a where clause.
    :param idx: Name of index to use.
    :param fields: Fields to return.
    :param limit: Not implemented. Ignore.
    :param offset: Not implemented. Ignore.
    :return: Matching tuples, projected onto fields.
    :raises DataTableException: on limit/offset use or a missing index.
    """
    if limit is not None or offset is not None:
        raise DataTableExceptions.DataTableException(
            -101, "Limit/offset not supported for CSVTable")

    # Buckets for this index name.
    index_dict = self.__indexes_dict__[idx]

    index = self.__get_index_by_name__(idx)
    if index is None:
        # Original did `raise ("No such Index %s" % idx)`, which is a
        # TypeError in Python 3 (only BaseException subclasses can be
        # raised); raise the project exception instead.
        raise DataTableExceptions.DataTableException(
            -100, "No such Index %s" % idx)

    # Build the hash key from the template's values for the index columns.
    key = tuple(t[col] for col in index.columns)

    result_rows = list()
    if key in index_dict:
        for row in index_dict[key]:
            # A bucket may match only on the index columns; re-check the
            # full template.
            if self.matches_template(row, t):
                result_rows.append(row)

    # The original called self.project(...) and discarded its return
    # value, so callers received the UNprojected rows; return the
    # projection instead.
    return self.project(result_rows, fields)
def create_table(self, table_name, file_name, column_definitions=None, primary_key_columns=None):
    """
    Create a new TableDefinition, refusing duplicate table names.

    :param table_name: Name for the new table.
    :param file_name: Backing CSV file path.
    :param column_definitions: Optional list of column definitions.
    :param primary_key_columns: Optional list of primary key column names.
    :return: The new TableDefinition.
    :raises DataTableException: if the table name already exists.
    """
    cursor = self.cnx.cursor()
    cursor.execute("select count(*) from tables where name='" + table_name + "';")
    existing = cursor.fetchall()[0]['count(*)']
    if existing > 0:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.
            duplicate_table_name,
            message="Table name " + table_name + " is duplicate.")

    # Primary key columns become a single PRIMARY index definition.
    # (index_defs also avoids shadowing the builtin `id`.)
    if primary_key_columns:
        index_defs = [IndexDefinition("PRIMARY", "PRIMARY", primary_key_columns)]
    else:
        index_defs = []
    if not column_definitions:
        column_definitions = []
    return TableDefinition(table_name, file_name, column_definitions,
                           index_defs, self.cnx)
def define_index(self, index_name, columns, kind="INDEX"):
    """
    Define or replace an index definition.

    :param index_name: Index name, must be unique within a table.
    :param columns: Valid list of columns.
    :param kind: One of the valid index types.
    :return: None
    :raises DataTableException: if a column is not defined on the table.
    """
    defined = {cd.column_name for cd in self.column_definitions}
    if not set(columns).issubset(defined):
        raise de.DataTableException(
            code=-1000,
            message="Key references an undefined column")

    # One catalog row per column, recording its 0-based position.
    for position, col_name in enumerate(columns):
        run_q(self.cnx,
              "insert into catalog_indices values (%s, %s, %s, %s, %s)",
              [index_name, self.table_name, col_name, position, kind],
              False)

    self.index_definitions.append(
        IndexDefinition(index_name, kind, columns))
def find_by_template(self, t, fields=None, limit=None, offset=None):
    """
    Select rows matching a template.

    1. Validate the template keys against the defined columns.
    2. Use __find_by_template_index__ when an applicable index exists.
    3. Otherwise fall back to __find_by_template_scan__.

    :param t: Template (column -> value) acting as the where clause.
    :param fields: Columns to return; None for all.
    :param limit: Not implemented.
    :param offset: Not implemented.
    :return: Matching rows; all rows when the template is falsy.
    :raises DataTableException: if a template key is not a defined column.
    """
    if not t:
        return self.__rows__

    defined = self.__get_column_names__()
    for key in t:
        if key not in defined:
            raise DataTableExceptions.DataTableException(
                code=-102,
                message="template values not relative to defined columns")

    access_path = self.__get_access_path__(t)
    if access_path:
        return self.__find_by_template_index__(t, access_path, fields)
    return self.__find_by_template_scan__(t, fields)
def drop_index(self, index_name):
    """
    Remove an index.

    :param index_name: Name of index to remove.
    :return: None
    :raises DataTableException: if the index is not defined.
    """
    if index_name not in self.index_nameset:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.invalid_index,
            message=index_name + " is not in the index definition list.")
    removal = None
    for ids in self.index_definitions:
        if ids.index_name == index_name:
            removal = ids
            break
    self.index_definitions.remove(removal)
    self.index_nameset.remove(index_name)

    # Scope the delete to this table: the original filtered only on
    # index_name, which deleted same-named indexes (e.g. "PRIMARY")
    # belonging to every other table in the catalog.
    cursor = self.cnx.cursor()
    cursor.execute(
        "delete from indexes where table_name = %s and index_name = %s;",
        (self.t_name, index_name))
    self.cnx.commit()
def __find_by_template_index__(self, t, idx, fields=None, limit=None, offset=None):
    """
    Find using a selected index.

    :param t: Template representing a where clause.
    :param idx: Name of index to use; its column names joined by "_".
    :param fields: Fields to return.
    :param limit: Not implemented. Ignore.
    :param offset: Not implemented. Ignore.
    :return: Matching tuples, or None when the table has no rows.
    """
    if limit is not None or offset is not None:
        raise DataTableExceptions.DataTableException(
            -101, "Limit/offset not supported for CSVTable")

    # Hash key: the template's values for the index columns, joined by "_".
    idx_columns = idx.split("_")
    idx_string = "_".join(str(t[col]) for col in idx_columns)

    if self.__rows__ is None:
        return None

    # Re-check the full template against each row in the bucket.
    matched = [row for row in self.index_info[idx][idx_string]
               if self.matches_template(row, t)]
    return self.project(matched, fields)
def __init__(self, t_name=None, csv_f=None, column_definitions=None,
             index_definitions=None, cnx=None, load=False):
    """
    :param t_name: Name of the table.
    :param csv_f: Full path to a CSV file holding the data.
    :param column_definitions: List of column definitions to use from file.
        Cannot contain an invalid column name. May be a subset of the columns.
    :param index_definitions: List of index definitions. Column names must be valid.
    :param cnx: Database connection to use. If None, create a default connection.
    :param load: When True, the definition already exists in the catalog;
        skip writing catalog rows.
    :raises DataTableException: on a missing/invalid name, path, column, or index.
    """
    if not t_name:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.none_table_name,
            message="The table name cannot be None!")
    if not csv_f:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.none_path,
            message="The CSV file path cannot be None!")
    # Probe the path for readability; a with-block fixes the file-handle
    # leak in the original bare open().
    try:
        with open(csv_f, 'r'):
            pass
    except Exception:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.none_path,
            message="The CSV file path is invalid!")

    if not cnx:
        cnx = pymysql.connect(host='localhost',
                              port=3306,
                              user='******',
                              password='******',
                              db='CSVCatalog',
                              charset='utf8mb4',
                              cursorclass=pymysql.cursors.DictCursor)

    self.t_name = t_name
    self.csv_f = csv_f
    self.column_nameset = set()
    with open(self.csv_f, 'r') as csvfile:
        self.columns = csv.DictReader(csvfile).fieldnames

    self.column_definitions = column_definitions if column_definitions else []
    for cd in self.column_definitions:
        if cd.column_name not in self.columns:
            raise DataTableExceptions.DataTableException(
                code=DataTableExceptions.DataTableException.
                invalid_column_definition,
                message="Column " + cd.column_name + " is invalid.")
        self.column_nameset.add(cd.column_name)

    self.index_definitions = index_definitions if index_definitions else []
    self.index_nameset = set()
    for idx in self.index_definitions:
        for col in idx.columns:
            if col not in self.column_nameset:
                raise DataTableExceptions.DataTableException(
                    code=DataTableExceptions.DataTableException.
                    invalid_index,
                    message="Index column " + idx.index_name + " is invalid.")
        if idx.index_name in self.index_nameset:
            raise DataTableExceptions.DataTableException(
                code=DataTableExceptions.DataTableException.
                duplicate_index,
                message="Index " + idx.index_name + " is duplicate.")
        self.index_nameset.add(idx.index_name)

    self.cnx = cnx
    if not load:
        cursor = self.cnx.cursor()
        cursor.execute("INSERT INTO tables VALUES (%s, %s);",
                       (self.t_name, self.csv_f))
        self.cnx.commit()
        for cd in self.column_definitions:
            cursor.execute("INSERT INTO columns VALUES (%s, %s, %s, %s);",
                           (self.t_name, cd.column_name, cd.column_type,
                            bool(cd.not_null)))
        self.cnx.commit()
        for idx in self.index_definitions:
            # Two fixes vs the original: `for i, col in id.columns` was
            # missing enumerate() (it tried to unpack each column-name
            # string), and the SQL had an unbalanced quote around the
            # position value. Position is 1-based to match define_index.
            for i, col in enumerate(idx.columns):
                cursor.execute(
                    "INSERT INTO indexes VALUES (%s, %s, %s, %s, %s);",
                    (self.t_name, idx.index_name, col,
                     idx.index_type, i + 1))
        self.cnx.commit()
def insert(self, r):
    """
    Insert a row. Not supported for this table type.

    :param r: Row to insert (ignored).
    :raises DataTableException: always; insert is not implemented.
    """
    err = DataTableExceptions.DataTableException(
        code=DataTableExceptions.DataTableException.not_implemented,
        message="Insert not implemented")
    raise err
def delete(self, t):
    """
    Delete rows matching a template. Not supported for this table type.

    :param t: Template of rows to delete (ignored).
    :raises DataTableException: always; delete is not implemented.
    """
    err = DataTableExceptions.DataTableException(
        code=DataTableExceptions.DataTableException.not_implemented,
        message="Delete not implemented")
    raise err
def update(self, t, change_values):
    """
    Update rows matching a template. Not supported for this table type.

    :param t: Template of rows to update (ignored).
    :param change_values: New values to apply (ignored).
    :raises DataTableException: always; update is not implemented.
    """
    err = DataTableExceptions.DataTableException(
        code=DataTableExceptions.DataTableException.not_implemented,
        message="Updated not implemented")
    raise err
def __init__(self, t_name=None, csv_f=None, column_definitions=None,
             index_definitions=None, cnx=None, load=False):
    """
    :param t_name: Name of the table.
    :param csv_f: Full path to a CSV file holding the data.
    :param column_definitions: List of column definitions to use from file.
        Cannot contain an invalid column name. May be a subset of the columns.
    :param index_definitions: List of index definitions. Column names must be valid.
    :param cnx: Database connection to use.
    :param load: Construct the table definition from existing metadata in the DB.
    :raises DataTableException: on a duplicate (create) or missing (load) table name.
    """
    assert t_name is not None
    assert csv_f is not None

    if not load:
        self.table_name = t_name
        self.csv_f = csv_f
        self.column_definitions = []
        self.index_definitions = []
        self.cnx = cnx
        self.valid_columns = self._read_csv_header()

        # Refuse to create a table whose name is already cataloged.
        if self.table_name in self._existing_table_names():
            raise de.DataTableException(
                code=-101,
                message="Table name {} is duplicate".format(self.table_name))

        # Add table metadata.
        run_q(self.cnx, "insert into catalog_tables values (%s, %s)",
              [t_name, csv_f], False)
        # Add column metadata.
        if column_definitions is not None:
            for c in column_definitions:
                self.add_column_definition(c)
        # Add index metadata.
        if index_definitions is not None:
            for i in index_definitions:
                self.define_index(i.index_name, i.columns, i.index_type)
    else:
        # Load an existing table: it must already be cataloged.
        self.table_name = t_name
        self.csv_f = csv_f
        self.column_definitions = column_definitions if column_definitions is not None else []
        self.index_definitions = index_definitions if index_definitions is not None else []
        self.cnx = cnx
        # The original formatted this error with self.table_name BEFORE
        # assigning it on this branch, raising AttributeError instead of
        # the intended message; attributes are now assigned first.
        if t_name not in self._existing_table_names():
            raise de.DataTableException(
                message="Table name {} does not exists".format(t_name))
        self.valid_columns = self._read_csv_header()

def _read_csv_header(self):
    """Return the column names from the first row of the backing CSV file."""
    csv.register_dialect('myDialect', delimiter=',', skipinitialspace=True)
    with open(self.csv_f, 'r') as csv_file:
        reader = csv.reader(csv_file, dialect='myDialect')
        for row in reader:
            return row
    return None

def _existing_table_names(self):
    """Return the list of table names currently in the catalog."""
    result = run_q(self.cnx, "select table_name from catalog_tables",
                   None, True)
    return [r['table_name'] for r in result]
def load_table_definition(cls, cnx, table_name):
    """
    Build a TableDefinition from the catalog tables in the database.

    :param cnx: Connection to use to load definition. If falsy, a default
        local pymysql connection is created.
    :param table_name: Name of table to load. Cannot be None.
    :return: TableDefinition and all sub-data (columns, indexes), read
        from the database tables holding catalog information.
    :raises DataTableException: if table_name is None.
    """
    if not cnx:
        cnx = pymysql.connect(host='localhost',
                              port=3306,
                              user='******',
                              password='******',
                              db='CSVCatalog',
                              charset='utf8mb4',
                              cursorclass=pymysql.cursors.DictCursor)
    if not table_name:
        raise DataTableExceptions.DataTableException(
            code=DataTableExceptions.DataTableException.none_table_name,
            message="The table name cannot be None!")
    # NOTE(review): d is never used below — left as-is.
    d = {}
    cursor = cnx.cursor()
    # definition part: fetch the table's catalog row to recover its path.
    cursor.execute("select * from tables where name='" + table_name + "';")
    definition = cursor.fetchone()
    if definition:
        path = definition['path']
    else:
        # Fallback path guess when the definition row cannot be fetched.
        path = "../data/" + str.capitalize(
            table_name
        ) + ".csv"  # Sorry I have to do this trick here, but there's always an error when fetching definition of "Appearances.csv" (see my post @847)
    # columns part
    cnx.commit()
    cursor.execute("select * from columns where table_name='" + table_name +
                   "';")
    columns = cursor.fetchall()
    column_definitions = []
    for col in columns:
        # not_null is stored as 0/1 in the catalog; convert to bool.
        column_definitions.append(
            ColumnDefinition(col["column_name"], col["column_type"],
                             col["not_null"] == 1))
    # indexes part
    cnx.commit()
    cursor.execute("select * from indexes where table_name='" + table_name +
                   "';")
    indexes = cursor.fetchall()
    index_definitions = []
    # Group per-column catalog rows by index name, keeping the kind.
    idx_cols = {}
    for idx in indexes:
        if idx["index_name"] in idx_cols.keys():
            idx_cols[idx["index_name"]]["cols"].append(idx["column"])
        else:
            idx_cols[idx["index_name"]] = {
                "kind": idx["kind"],
                "cols": [idx["column"]]
            }
    for idx in idx_cols:
        index_definitions.append(
            IndexDefinition(idx, idx_cols[idx]["kind"], idx_cols[idx]["cols"]))
    # load=True: build the definition without re-inserting catalog rows.
    load_table = TableDefinition(table_name, path, column_definitions,
                                 index_definitions, cnx, True)
    cnx.commit()
    return load_table