def ImportDataTable(calculation_object, dataset_id, tableid, folder, import_settings):
    global tableOrder, property_order
    with calculation_object.LogHeader('Importing 2D datatable {0}'.format(tableid)):
        print('Source: ' + folder)
        DQXUtils.CheckValidIdentifier(tableid)
        table_settings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        table_settings.RequireTokens(['NameSingle', 'NamePlural', 'FirstArrayDimension'])
        table_settings.AddTokenIfMissing('ShowInGenomeBrowser', False)
        table_settings.AddTokenIfMissing('ColumnDataTable', '')
        table_settings.AddTokenIfMissing('RowDataTable', '')
        extra_settings = table_settings.Clone()
        extra_settings.DropTokens(['ColumnDataTable', 'ColumnIndexField',
                                   'RowDataTable', 'RowIndexField', 'Properties'])
        remote_hdf5 = h5py.File(os.path.join(folder, 'data.hdf5'), 'r')

        # Check that the referenced tables exist and have the specified index fields.
        if table_settings['ColumnDataTable']:
            sql = "SELECT id FROM tablecatalog WHERE id = '{0}'".format(table_settings['ColumnDataTable'])
            id = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            try:
                id = id[0][0]
            except IndexError:
                raise Exception("Index table " + table_settings['ColumnDataTable'] + " doesn't exist")
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['ColumnIndexField'],
                                                       table_settings['ColumnDataTable'])
            try:
                field = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            except Exception:
                raise Exception(table_settings['ColumnIndexField'] +
                                " column index field doesn't exist in table " +
                                table_settings['ColumnDataTable'])
        if table_settings['RowDataTable']:
            sql = "SELECT id FROM tablecatalog WHERE id = '{0}'".format(table_settings['RowDataTable'])
            id = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            try:
                id = id[0][0]
            except IndexError:
                raise Exception("Index table " + table_settings['RowDataTable'] + " doesn't exist")
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['RowIndexField'],
                                                       table_settings['RowDataTable'])
            try:
                field = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            except Exception:
                raise Exception(table_settings['RowIndexField'] +
                                " row index field doesn't exist in table " +
                                table_settings['RowDataTable'])
        if table_settings['ShowInGenomeBrowser']:
            sql = "SELECT IsPositionOnGenome FROM tablecatalog WHERE id='{0}'".format(table_settings['ColumnDataTable'])
            is_position = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)[0][0]
            if not is_position:
                raise Exception(table_settings['ColumnDataTable'] +
                                ' is not a genomic position based table (IsPositionOnGenome in config), '
                                'but you have asked to use this table as a column index on a '
                                'genome-browsable 2D array.')
        if table_settings['FirstArrayDimension'] not in ['column', 'row']:
            raise Exception("FirstArrayDimension must be column or row")

        # Add to the 2D table catalog
        extra_settings.ConvertStringsToSafeSQL()
        sql = "INSERT INTO 2D_tablecatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}', {6})".format(
            tableid,
            table_settings['NamePlural'],
            table_settings['ColumnDataTable'],
            table_settings['RowDataTable'],
            table_settings['FirstArrayDimension'],
            extra_settings.ToJSON(),
            tableOrder)
        ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
        tableOrder += 1

        # Add each property to the 2D property catalog
        for property in table_settings['Properties']:
            extra_settings = copy.deepcopy(property)
            dtype = arraybuffer._strict_dtype_string(remote_hdf5[property['Id']].dtype)
            del extra_settings['Id']
            del extra_settings['Name']
            sql = "INSERT INTO 2D_propertycatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', {5}, '{6}', '{7}')".format(
                property['Id'],
                tableid,
                table_settings['ColumnDataTable'],
                table_settings['RowDataTable'],
                property['Name'],
                property_order,
                dtype,
                simplejson.dumps(extra_settings))
            ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
            property_order += 1

        if not import_settings['ConfigOnly']:
            # Insert an index column into the index tables
            if table_settings['ColumnDataTable']:
                if table_settings.HasToken('ColumnIndexArray'):
                    # We have an array that matches a column in the 1D SQL table, so we add an
                    # index to that table. First create a temporary table holding the index array.
                    try:
                        column_index = remote_hdf5[table_settings['ColumnIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['ColumnIndexArray']))
                    for property in table_settings['Properties']:
                        if len(column_index) != remote_hdf5[property['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'column' else 1]:
                            raise Exception("Property {0} has a different column length to the column index".format(property['Id']))
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempColIndex',
                                                               table_settings['ColumnIndexField'],
                                                               column_index)
                    ImpUtils.ExecuteSQLGenerator(calculation_object, dataset_id, sql)
                    # Number the rows of the temporary table, then copy that numbering across
                    # to the data table by joining on the index field.
                    sql = """ALTER TABLE `TempColIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `TempColIndex` SET `index` = @i:=@i+1;
                             ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;
                             UPDATE `{0}` INNER JOIN `TempColIndex` ON `{0}`.`{1}` = `TempColIndex`.`{1}`
                                 SET `{0}`.`{2}_column_index` = `TempColIndex`.`index`;
                             DROP TABLE `TempColIndex`""".format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
                    # Now check we have no NULLs
                    sql = "SELECT `{1}_column_index` FROM `{0}` WHERE `{1}_column_index` IS NULL".format(
                        table_settings['ColumnDataTable'],
                        tableid)
                    nulls = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
                    if len(nulls) > 0:
                        raise Exception("Not all rows in {0} have a corresponding column in 2D datatable {1}".format(
                            table_settings['ColumnDataTable'], tableid))
                else:
                    # No array of keys was given, so the HDF5 data is taken to be in the same
                    # order as the sorted "ColumnIndexField"; number the rows in that order.
                    sql = """ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `{0}` SET `{2}_column_index` = @i:=@i+1 ORDER BY `{1}`;""".format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
            if table_settings['RowDataTable']:
                if table_settings.HasToken('RowIndexArray'):
                    # Same as the column case: create a temporary table from the index array.
                    try:
                        row_index = remote_hdf5[table_settings['RowIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['RowIndexArray']))
                    for property in table_settings['Properties']:
                        if len(row_index) != remote_hdf5[property['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'row' else 1]:
                            raise Exception("Property {0} has a different row length to the row index".format(property['Id']))
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempRowIndex',
                                                               table_settings['RowIndexField'],
                                                               row_index)
                    ImpUtils.ExecuteSQLGenerator(calculation_object, dataset_id, sql)
                    # Number the rows of the temporary table, then copy that numbering across
                    # to the data table by joining on the index field.
                    sql = """ALTER TABLE `TempRowIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `TempRowIndex` SET `index` = @i:=@i+1;
                             ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;
                             UPDATE `{0}` INNER JOIN `TempRowIndex` ON `{0}`.`{1}` = `TempRowIndex`.`{1}`
                                 SET `{0}`.`{2}_row_index` = `TempRowIndex`.`index`;
                             DROP TABLE `TempRowIndex`""".format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
                    # Now check we have no NULLs
                    sql = "SELECT `{1}_row_index` FROM `{0}` WHERE `{1}_row_index` IS NULL".format(
                        table_settings['RowDataTable'],
                        tableid)
                    nulls = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
                    if len(nulls) > 0:
                        raise Exception("Not all rows in {0} have a corresponding row in 2D datatable {1}".format(
                            table_settings['RowDataTable'], tableid))
                else:
                    # No array of keys was given, so the HDF5 data is taken to be in the same
                    # order as the sorted "RowIndexField"; number the rows in that order.
                    sql = """ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `{0}` SET `{2}_row_index` = @i:=@i+1 ORDER BY `{1}`;""".format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)

            # We have the indexes - now we need a local copy of the HDF5 data for each property
            ImpUtils.mkdir(os.path.join(config.BASEDIR, '2D_data'))
            local_hdf5 = h5py.File(os.path.join(config.BASEDIR, '2D_data',
                                                dataset_id + '_' + tableid + '.hdf5'),
                                   'w', libver='latest')
            print "Copying HDF5 datasets"
            for property in table_settings['Properties']:
                print "..", property['Id']
                local_hdf5.copy(remote_hdf5[property['Id']], property['Id'])
                print "done"
            print "all copies complete"
            local_hdf5.close()
        remote_hdf5.close()
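

# --- Illustrative usage (not part of the original module) ---------------------
# A minimal sketch of how this function-style importer might be driven from a
# dataset-level import loop. The '2D_datatables' folder layout, the `calc`
# calculation object and the `dataset_folder` variable are assumptions inferred
# from the parameter names above, not a documented API:
#
#     datatables_root = os.path.join(dataset_folder, '2D_datatables')
#     for tableid in os.listdir(datatables_root):
#         folder = os.path.join(datatables_root, tableid)
#         if os.path.isdir(folder):
#             ImportDataTable(calc, dataset_id, tableid, folder,
#                             {'ConfigOnly': False})
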
def ImportDataTable(self, tableid):
    with self._calculationObject.LogHeader('Importing 2D datatable {0}'.format(tableid)):
        DQXUtils.CheckValidTableIdentifier(tableid)
        self._calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(self._datasetId, '2D_tablecatalog'))
        self._calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(self._datasetId, '2D_propertycatalog'))
        max_line_count = None
        if self._maxLineCount > 0:
            max_line_count = self._maxLineCount
        table_settings = self.getSettings(tableid)
        settingsFile, dataFile = self._getDataFiles(tableid)
        remote_hdf5 = h5py.File(dataFile, 'r')

        # Check that the referenced tables exist and have the specified index fields.
        if table_settings['ColumnDataTable']:
            tables = self._getTablesInfo(table_settings['ColumnDataTable'])
            cat_id = tables[0]["id"]
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['ColumnIndexField'],
                                                       table_settings['ColumnDataTable'])
            try:
                idx_field = self._execSqlQuery(sql)
            except Exception:
                raise Exception(table_settings['ColumnIndexField'] +
                                " column index field doesn't exist in table " +
                                table_settings['ColumnDataTable'])
        if table_settings['RowDataTable']:
            tables = self._getTablesInfo(table_settings['RowDataTable'])
            cat_id = tables[0]["id"]
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['RowIndexField'],
                                                       table_settings['RowDataTable'])
            try:
                idx_field = self._execSqlQuery(sql)
            except Exception:
                raise Exception(table_settings['RowIndexField'] +
                                " row index field doesn't exist in table " +
                                table_settings['RowDataTable'])
        if table_settings['ShowInGenomeBrowser']:
            sql = "SELECT IsPositionOnGenome FROM tablecatalog WHERE id='{0}'".format(table_settings['ColumnDataTable'])
            is_position = self._execSqlQuery(sql)[0][0]
            if not is_position:
                raise Exception(table_settings['ColumnDataTable'] +
                                ' is not a genomic position based table (IsPositionOnGenome in config), '
                                'but you have asked to use this table as a column index on a '
                                'genome-browsable 2D array.')
        if table_settings['FirstArrayDimension'] not in ['column', 'row']:
            raise Exception("FirstArrayDimension must be column or row")

        # Add to the 2D table catalog
        sql = "INSERT INTO 2D_tablecatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}', {6})".format(
            tableid,
            table_settings['NamePlural'],
            table_settings['ColumnDataTable'],
            table_settings['RowDataTable'],
            table_settings['FirstArrayDimension'],
            table_settings.serialize(),
            self.tableOrder)
        self._execSql(sql)
        self.tableOrder += 1

        # Add each property to the 2D property catalog
        for propname in table_settings.getPropertyNames():
            propid = table_settings.getPropertyValue(propname, 'Id')
            dtype = arraybuffer._strict_dtype_string(remote_hdf5[propid].dtype)
            arity = 1 if len(remote_hdf5[propid].shape) == 2 else remote_hdf5[propid].shape[2]
            sql = "INSERT INTO 2D_propertycatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', {5}, '{6}', '{7}', {8})".format(
                propid,
                tableid,
                table_settings['ColumnDataTable'],
                table_settings['RowDataTable'],
                table_settings.getPropertyValue(propname, 'Name'),
                self.property_order,
                dtype,
                table_settings.serializeProperty(propname),
                arity)
            self._execSql(sql)
            self.property_order += 1

        if not self._importSettings['ConfigOnly']:
            # Insert an index column into the index tables
            if table_settings['ColumnDataTable']:
                # Assume that the index field has been created on import in LoadTable - it's much faster.
                # We could just run the command and ignore the error raised if it already exists:
                # sql = "ALTER TABLE `{0}` ADD `{1}_column_index` INT DEFAULT NULL;".format(table_settings['ColumnDataTable'], tableid)
                # self._execSql(sql)
                if table_settings['ColumnIndexArray']:
                    # We have an array that matches a column in the 1D SQL table, so we add an
                    # index to that table. First create a temporary table holding the index array.
                    try:
                        column_index = remote_hdf5[table_settings['ColumnIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['ColumnIndexArray']))
                    for prop in table_settings['Properties']:
                        if len(column_index) != remote_hdf5[prop['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'column' else 1]:
                            raise Exception("Property {0} has a different column length to the column index".format(prop['Id']))
                    # TempColIndex should really be a TEMPORARY table
                    self._dropTable('`TempColIndex`')
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempColIndex',
                                                               table_settings['ColumnIndexField'],
                                                               column_index[0:max_line_count])
                    ImpUtils.ExecuteSQLGenerator(self._calculationObject, self._datasetId, sql)
                    # Add an index column to the table - catch the exception if it already exists.
                    sql = "ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;".format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    try:
                        self._execSql(sql)
                    except OperationalError as e:
                        if e.args[0] != 1060:  # 1060: duplicate column name
                            raise
                    # Number the rows of the temporary table, then copy that numbering across
                    # to the data table by joining on the index field.
                    sql = """ALTER TABLE `TempColIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `TempColIndex` SET `index` = @i:=@i+1;
                             UPDATE `{0}` INNER JOIN `TempColIndex` ON `{0}`.`{1}` = `TempColIndex`.`{1}`
                                 SET `{0}`.`{2}_column_index` = `TempColIndex`.`index`;
                             """.format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    self._execSql(sql)
                    self._dropTable('`TempColIndex`')
                    # Now check we have no NULLs
                    sql = "SELECT `{1}_column_index` FROM `{0}` WHERE `{1}_column_index` IS NULL".format(
                        table_settings['ColumnDataTable'],
                        tableid)
                    nulls = self._execSqlQuery(sql)
                    if len(nulls) > 0:
                        print("WARNING: Not all rows in {0} have a corresponding column in 2D datatable {1}".format(
                            table_settings['ColumnDataTable'], tableid))
                else:
                    # Add an index column to the table - catch the exception if it already exists.
                    sql = "ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;".format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    try:
                        self._execSql(sql)
                    except OperationalError as e:
                        if e.args[0] != 1060:  # 1060: duplicate column name
                            raise
                    # No array of keys was given, so the HDF5 data is taken to be in the same
                    # order as the sorted "ColumnIndexField"; number the rows in that order.
                    if max_line_count:
                        sql = "SELECT @i:=-1;UPDATE `{0}` SET `{2}_column_index` = @i:=@i+1 ORDER BY `{1}` LIMIT {3};"
                    else:
                        sql = "SELECT @i:=-1;UPDATE `{0}` SET `{2}_column_index` = @i:=@i+1 ORDER BY `{1}`;"
                    sql = sql.format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid,
                        max_line_count)
                    self._execSql(sql)
            if table_settings['RowDataTable']:
                # Add an index column to the table - catch the exception if it already exists.
                sql = "ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;".format(
                    table_settings['RowDataTable'],
                    table_settings['RowIndexField'],
                    tableid)
                try:
                    self._execSql(sql)
                except OperationalError as e:
                    if e.args[0] != 1060:  # 1060: duplicate column name
                        raise
                if table_settings['RowIndexArray']:
                    # We have an array that matches a column in the 1D SQL table, so we add an
                    # index to that table. First create a temporary table holding the index array.
                    try:
                        row_index = remote_hdf5[table_settings['RowIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['RowIndexArray']))
                    for prop in table_settings['Properties']:
                        if len(row_index) != remote_hdf5[prop['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'row' else 1]:
                            raise Exception("Property {0} has a different row length to the row index".format(prop['Id']))
                    self._dropTable('`TempRowIndex`')
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempRowIndex',
                                                               table_settings['RowIndexField'],
                                                               row_index)
                    ImpUtils.ExecuteSQLGenerator(self._calculationObject, self._datasetId, sql)
                    # Number the rows of the temporary table, then copy that numbering across
                    # to the data table by joining on the index field.
                    sql = """ALTER TABLE `TempRowIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `TempRowIndex` SET `index` = @i:=@i+1;
                             UPDATE `{0}` INNER JOIN `TempRowIndex` ON `{0}`.`{1}` = `TempRowIndex`.`{1}`
                                 SET `{0}`.`{2}_row_index` = `TempRowIndex`.`index`;
                             """.format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    self._execSql(sql)
                    self._dropTable('`TempRowIndex`')
                    # Now check we have no NULLs
                    sql = "SELECT `{1}_row_index` FROM `{0}` WHERE `{1}_row_index` IS NULL".format(
                        table_settings['RowDataTable'],
                        tableid)
                    nulls = self._execSqlQuery(sql)
                    if len(nulls) > 0:
                        print("WARNING: Not all rows in {0} have a corresponding row in 2D datatable {1}".format(
                            table_settings['RowDataTable'], tableid))
                else:
                    # No array of keys was given, so the HDF5 data is taken to be in the same
                    # order as the sorted "RowIndexField"; number the rows in that order.
                    sql = "SELECT @i:=-1;UPDATE `{0}` SET `{2}_row_index` = @i:=@i+1 ORDER BY `{1}`;".format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    self._execSql(sql)

            # We have the indexes - now we need a local copy of the HDF5 data for each property
            ImpUtils.mkdir(os.path.join(config.BASEDIR, '2D_data'))
            local_path = os.path.join(config.BASEDIR, '2D_data',
                                      self._datasetId + '_' + tableid + '.hdf5')
            try:
                os.remove(local_path)
            except OSError:
                pass
            if table_settings['SymlinkData']:
                print "Symlinking datasets - will only work on unix"
                os.symlink(dataFile, local_path)
            else:
                local_hdf5 = h5py.File(local_path, 'w', libver='latest')
                print "Copying HDF5 datasets"
                for prop in table_settings['Properties']:
                    print "..", prop['Id']
                    prop_in = remote_hdf5[prop['Id']]
                    # Choose chunk shapes on the assumption the data is variants x samples:
                    # long chunks along the larger axis, short along the smaller.
                    if prop_in.shape[0] > prop_in.shape[1]:
                        chunks = [min(1000, prop_in.shape[0]), min(10, prop_in.shape[1])]
                    else:
                        chunks = [min(10, prop_in.shape[0]), min(1000, prop_in.shape[1])]
                    arity = 1 if len(prop_in.shape) == 2 else prop_in.shape[2]
                    if arity > 1:
                        chunks.append(arity)
                    prop_out = local_hdf5.create_dataset(prop['Id'],
                                                         prop_in.shape,
                                                         prop_in.dtype,
                                                         chunks=tuple(chunks),
                                                         maxshape=prop_in.shape,
                                                         compression='gzip',
                                                         fletcher32=False,
                                                         shuffle=False)
                    self._hdf5_copy(prop_in, prop_out,
                                    limit=(None, max_line_count) if table_settings['FirstArrayDimension'] == 'row'
                                          else (max_line_count, None))
                    print "done"
                print "all copies complete"
                local_hdf5.close()
            remote_hdf5.close()