def deleteGenomeList(self, genome_list_ids):
    """Delete genome lists and remove their associated genomes.

    Permissions are checked for every list before anything is modified,
    and the user is asked to confirm the whole operation exactly once.

    Parameters
    ----------
    genome_list_ids : iterable
        Unique identifier of genome lists in database.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the user lacks permission for any list or declines the
        confirmation prompt.
    """
    # Materialise once: the ids are walked twice and counted for the prompt.
    genome_list_ids = list(genome_list_ids)
    if not genome_list_ids:
        return True

    # Validate everything up front so a permission failure on a later list
    # cannot leave earlier lists already deleted.
    for genome_list_id in genome_list_ids:
        if self.permissionToModify(genome_list_id) is False:
            raise GenomeDatabaseError(
                "Insufficient permissions to delete genome list. Offending list id: {0}"
                .format(genome_list_id))

    # The prompt reports the total number of lists, so it is asked once
    # rather than once per list.
    if not confirm(
            "Are you sure you want to delete {0} lists (this action cannot be undone)"
            .format(len(genome_list_ids))):
        raise GenomeDatabaseError("User aborted database action.")

    for genome_list_id in genome_list_ids:
        list_genomes_ids = self.getGenomeIdsFromGenomeListIds(
            [genome_list_id])
        self.editGenomeList(genome_list_id, list_genomes_ids, 'remove')

    return True
def deleteMarkerSets(self, marker_set_ids):
    """Delete marker sets and remove their associated markers.

    Permissions are checked for every set before anything is modified,
    and the user is asked to confirm the whole operation exactly once.

    Parameters
    ----------
    marker_set_ids : iterable
        Unique identifier of marker sets in database.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the user lacks permission for any set or declines the
        confirmation prompt.
    """
    # Materialise once: the ids are walked twice and counted for the prompt.
    marker_set_ids = list(marker_set_ids)
    if not marker_set_ids:
        return True

    # Validate everything up front so a permission failure on a later set
    # cannot leave earlier sets already deleted.
    for marker_set_id in marker_set_ids:
        if self.permissionToModify(marker_set_id) is False:
            raise GenomeDatabaseError(
                "Insufficient permissions to delete marker set. Offending marker set id: {0}"
                .format(marker_set_id))

    # The prompt reports the total number of sets, so it is asked once
    # rather than once per set.
    if not confirm(
            "Are you sure you want to delete {0} set(s) (this action cannot be undone)"
            .format(len(marker_set_ids))):
        raise GenomeDatabaseError("User aborted database action.")

    for marker_set_id in marker_set_ids:
        list_marker_ids = self.getMarkerIdsFromMarkerSetIds([marker_set_id])
        self.editMarkerSet(marker_set_id, list_marker_ids, 'remove')

    return True
def getGenomeIdsFromGenomeListIds(self, genome_list_ids):
    """Return the ids of all genomes contained in the given genome lists.

    Parameters
    ----------
    genome_list_ids : list
        Identifiers of the genome lists whose contents are retrieved.

    Returns
    -------
    list
        Genome ids contained within the specified genome list(s).

    Raises
    ------
    GenomeDatabaseError
        If no list ids are given, the ids cannot be staged in the
        temporary table, or an id does not match an existing genome list.
    """
    if not genome_list_ids:
        raise GenomeDatabaseError(
            "No genome lists given. Cannot retrieve IDs")

    # Stage the requested ids in a temp table so unknown ids can be found
    # with a single anti-join style query.
    temp_table_name = generateTempTableName()
    try:
        self.cur.execute("CREATE TEMP TABLE %s (id integer)" %
                         (temp_table_name, ))
        query = "INSERT INTO {0} (id) VALUES (%s)".format(temp_table_name)
        self.cur.executemany(
            query, [(list_id, ) for list_id in genome_list_ids])
    except Exception:
        # Report every requested id: which one failed is not known here.
        raise GenomeDatabaseError(
            "Unable to process genome list IDs: %s" % str(genome_list_ids))

    # Find any ids that don't have genome lists
    query = ("SELECT id FROM {0} " +
             "WHERE id NOT IN ( " +
             "SELECT id " +
             "FROM genome_lists)").format(temp_table_name)
    self.cur.execute(query)

    missing_list_ids = [list_id for (list_id, ) in self.cur]
    if missing_list_ids:
        raise GenomeDatabaseError(
            "Unknown genome list id(s) given. %s" % str(missing_list_ids))

    # Collect the genomes belonging to the requested lists.
    self.cur.execute(
        "SELECT genome_id " +
        "FROM genome_list_contents " +
        "WHERE list_id in %s", (tuple(genome_list_ids), ))

    return [genome_id for (genome_id, ) in self.cur.fetchall()]
def moveGenomes(self, db_genome_ids):
    """Relocate staged genome directories into the database directory tree.

    Must run after addGenomes(); it is kept separate so that every database
    query has completed before any file is moved.

    Parameters
    ----------
    db_genome_ids : list
        Unique database identifiers for genomes.

    Raises
    ------
    GenomeDatabaseError
        If the acting username cannot be determined or a target genome
        directory already exists.
    """
    assert(self.tmp_output_dir)

    # Map database id -> external id for the user-editable genomes only.
    self.cur.execute("SELECT genomes.id,user_editable, external_id_prefix || '_' || id_at_source as external_id " +
                     "FROM genomes, genome_sources " +
                     "WHERE genome_source_id = genome_sources.id " +
                     "AND genomes.id in %s", (tuple(db_genome_ids),))

    editable_genomes = {}
    for row in self.cur:
        genome_id, user_editable, external_id = row
        if user_editable:
            editable_genomes[genome_id] = external_id

    if editable_genomes:
        # Root acts on behalf of the user it was elevated from.
        if self.currentUser.isRootUser():
            username = self.currentUser.getElevatedFromUsername()
        else:
            username = self.currentUser.getUsername()

        if username is None:
            raise GenomeDatabaseError(
                "Unable to determine user to add genomes under.")

        gtdb_target_dir = os.path.join(self.genomeCopyDir, username)

        for db_genome_id, external_id in editable_genomes.items():
            staged_dir = os.path.join(self.tmp_output_dir, external_id)
            genome_target_dir = os.path.join(gtdb_target_dir, external_id)

            if os.path.exists(genome_target_dir):
                raise GenomeDatabaseError(
                    "Genome directory already exists: %s" % genome_target_dir)

            shutil.move(staged_dir, genome_target_dir)

            protein_file = external_id + self.proteinFileSuffix
            fasta_location = os.path.join(
                username, external_id, external_id + self.genomeFileSuffix)
            genes_location = os.path.join(
                username, external_id, self.userAnnotationDir, protein_file)
            # The checksum is taken from the file at its final location.
            genes_sha256 = sha256(os.path.join(
                genome_target_dir, self.userAnnotationDir, protein_file))

            self.cur.execute("UPDATE genomes SET fasta_file_location = %s , genes_file_location = %s , genes_file_sha256 = %s WHERE id = %s",
                             (fasta_location, genes_location, genes_sha256, db_genome_id))

    shutil.rmtree(self.tmp_output_dir)
def addGenomeList(self, genome_id_list, name, description, owner_id=None, private=None):
    """Create a genome list and populate it with the given genomes.

    Parameters
    ----------
    genome_id_list : list
        Genome ids to place in the new list.
    name : str
        Name of the new list.
    description : str
        Description of the new list.
    owner_id : str
        Id of the user who will own the list; None creates a root-owned list.
    private : bool
        Whether the list is private.

    Returns
    -------
    int
        Database identifier of newly created genome list.

    Raises
    ------
    GenomeDatabaseError
        If a non-root user tries to create a root-owned list or a list
        owned by somebody else.
    """
    owned_by_root = owner_id is None
    acting_as_root = self.currentUser.isRootUser()

    if owned_by_root:
        if not acting_as_root:
            raise GenomeDatabaseError(
                "Only the root user can create root owned lists.")
    elif not acting_as_root and self.currentUser.getUserId() != owner_id:
        raise GenomeDatabaseError(
            "Only the root user may create lists on behalf of other people."
        )

    self.cur.execute(
        "INSERT INTO genome_lists (name, description, owned_by_root, owner_id, private) VALUES (%s, %s, %s, %s, %s) RETURNING id",
        (name, description, owned_by_root, owner_id, private))
    (genome_list_id, ) = self.cur.fetchone()

    contents = [(genome_list_id, genome_id) for genome_id in genome_id_list]
    self.cur.executemany(
        "INSERT INTO genome_list_contents (list_id, genome_id) VALUES (%s, %s)",
        contents)

    return genome_list_id
def _addGenomesWorker(self, genomic_files, file_paths, checkm_results_dict, study_id, out_q, progress_queue):
    """Worker, invoked in a process, that attaches study and metadata to genomes.

    Parameters
    ----------
    genomic_files : dict
        {genome_id: {checkm_bin_id: value, aa_gene_path: value, fasta_path: value}}
    file_paths : dict
        Per-genome file paths generated from Prodigal; the "fasta_path" and
        "gff_path" entries are read here.
    checkm_results_dict : dict
        CheckM results keyed by bin id.
    study_id
        Study id assigned to every processed genome.
    out_q
        Queue that receives "True" once all genomes are processed.
    progress_queue
        Queue that receives the bin id of each processed genome.

    Returns
    -------
    bool
        True once every genome has been processed.

    Raises
    ------
    GenomeDatabaseError
        If a genome's CheckM bin id has no entry in checkm_results_dict.
    """
    metadata_mngr = MetadataManager(self.cur, self.currentUser)

    for db_genome_id, values in genomic_files.iteritems():
        self.cur.execute("UPDATE genomes SET study_id = %s WHERE id = %s",
                         (study_id, db_genome_id))

        genome_file_paths = file_paths[db_genome_id]
        # Metadata output goes next to the genome's FASTA file.
        output_dir = os.path.dirname(genome_file_paths["fasta_path"])

        bin_id = values['checkm_bin_id']
        if bin_id not in checkm_results_dict:
            raise GenomeDatabaseError(
                "Couldn't find CheckM result for bin %s." % bin_id)

        metadata_mngr.addMetadata(db_genome_id,
                                  genome_file_paths["fasta_path"],
                                  genome_file_paths["gff_path"],
                                  checkm_results_dict[bin_id],
                                  output_dir)

        progress_queue.put(bin_id)

    out_q.put("True")
    return True
def ExportGenomePaths(self, path):
    """Write the directory of every genome to a tab-separated file.

    Each output line holds the external genome id (prefix, underscore,
    id at source) and the directory containing its FASTA file.

    Parameters
    ----------
    path : str
        Path to the output file.

    Raises
    ------
    GenomeDatabaseError
        If a genome source prefix is not one of 'U', 'RS' or 'GB'.
    """
    # Name of the Config attribute holding the base directory per prefix.
    config_attr_for_prefix = {
        'U': 'GTDB_GENOME_USR_DIR',
        'RS': 'GTDB_GENOME_RSQ_DIR',
        'GB': 'GTDB_GENOME_GBK_DIR',
    }

    self.cur.execute(
        "SELECT id_at_source,external_id_prefix,fasta_file_location FROM genomes,genome_sources WHERE genomes.genome_source_id=genome_sources.id;")

    with open(path, "w") as out_fh:
        for (source_id, prefix, file_location) in self.cur:
            if prefix not in config_attr_for_prefix:
                raise GenomeDatabaseError(
                    "Unrecognized database prefix: %s" % prefix)
            dir_prefix = getattr(Config, config_attr_for_prefix[prefix])

            external_id = prefix + "_" + source_id
            genome_dir = os.path.dirname(
                os.path.join(dir_prefix, file_location))
            out_fh.write("{0}\t{1}\n".format(external_id, genome_dir))

    print("Export Successful")
def getMarkerIdsFromMarkerSetIds(self, marker_set_ids):
    """Get marker identifiers within specific marker sets.

    Parameters
    ----------
    marker_set_ids : iterable
        Identifiers of marker sets.

    Returns
    -------
    list
        Identifier of markers in marker sets.

    Raises
    ------
    GenomeDatabaseError
        If no marker set ids are given or any of them does not exist.
    """
    marker_set_ids = list(marker_set_ids)
    if not marker_set_ids:
        # An empty tuple would render as the invalid SQL "id in ()".
        raise GenomeDatabaseError(
            "At least one marker set is invalid: %s" % str(marker_set_ids))

    self.cur.execute(
        "SELECT id " +
        "FROM marker_sets " +
        "WHERE id in %s ", (tuple(marker_set_ids), ))

    # Compare as strings so ids supplied as text still match integer keys.
    known_ids = set(str(set_id) for (set_id, ) in self.cur.fetchall())
    unknown_ids = [x for x in marker_set_ids if str(x) not in known_ids]
    if unknown_ids:
        raise GenomeDatabaseError(
            "At least one marker set is invalid: %s" % str(unknown_ids))

    self.cur.execute(
        "SELECT marker_id " +
        "FROM marker_set_contents " +
        "WHERE set_id in %s ", (tuple(marker_set_ids), ))

    return [marker_id for (marker_id, ) in self.cur.fetchall()]
def importMetadata(self, table=None, field=None, typemeta=None, metafile=None):
    """Import one metadata field for a list of genomes.

    Parameters
    ----------
    table : str
        Table where the column is located.
    field : str
        Name of the column.
    typemeta : str
        Data type of the column.
    metafile : str
        TSV file with one "genome_id<TAB>value" entry per line. The genome
        id carries a source prefix ("<prefix>_<id>") which is stripped
        before the values are sent to the database.

    Raises
    ------
    GenomeDatabaseError
        If the file contains no entries, a line is malformed, or the
        database rejects the update.
    """
    genome_ids = []
    meta_values = []
    with open(metafile, 'r') as metaf:
        for line_no, line in enumerate(metaf, 1):
            line = line.strip()
            if not line:
                continue

            columns = line.split('\t')
            if len(columns) < 2:
                raise GenomeDatabaseError(
                    "Line %d of %s does not contain a genome id and a value."
                    % (line_no, metafile))

            # Drop the source prefix: everything up to the first underscore.
            _prefix, separator, db_genome_id = columns[0].partition('_')
            if not separator:
                raise GenomeDatabaseError(
                    "Line %d of %s has a genome id without a source prefix: %s"
                    % (line_no, metafile, columns[0]))

            genome_ids.append(db_genome_id)
            meta_values.append(columns[1])

    if not genome_ids:
        raise GenomeDatabaseError(
            "No metadata entries found in file: %s" % metafile)

    try:
        # Every argument is bound as a parameter so values are escaped by
        # the driver instead of being spliced into the SQL text.
        self.cur.execute("SELECT upsert(%s,%s,%s,%s,%s)",
                         (table, field, typemeta, genome_ids, meta_values))
    except psycopg2.Error as e:
        raise GenomeDatabaseError(e.pgerror)
def printGenomeStats(self, genome_id_list, stat_fields):
    """Collect selected statistics for the given genomes.

    Parameters
    ----------
    genome_id_list : iterable
        Unique identifier of genomes in database.
    stat_fields : list
        Names of the metadata_view columns to report.

    Returns
    -------
    list
        Column headers ('accession' followed by the requested fields).
    list
        Content for each row, without the leading database id.

    Raises
    ------
    GenomeDatabaseError
        If no genome ids are given.
    """
    if not genome_id_list:
        raise GenomeDatabaseError(
            "Unable to print genomes. No genomes found.")

    # The id is selected but left out of both the header and the rows.
    columns = ['id', 'accession'] + stat_fields
    query = "SELECT {0} FROM metadata_view WHERE id in %s".format(
        ','.join(columns))
    self.cur.execute(query, (tuple(genome_id_list),))

    rows = [record[1:] for record in self.cur]
    return columns[1:], rows
def _hasPermissionToEditGenomes(self, db_genome_ids): ''' Function _hasPermissionToEditGenomes Check if a user is entitled to delete genomes. Users can delete their own genomes, Admin can delete any genomes :param db_genome_ids:list of genomes is to delete Return a tuple containing: - Boolean returning the state of the function - The username currently running the delete function -a Dictionary listing the list of genomes and where each genome has saved the owner,the Prefix (U,GB or RS) , the relative path ''' try: if not db_genome_ids: raise GenomeDatabaseError( "Unable to retrieve genome permissions, no genomes given: %s" % str(db_genome_ids)) self.cur.execute( "SELECT gs.external_id_prefix,gs.external_id_prefix || '_'|| genomes.id_at_source, owner_id, username, owned_by_root,fasta_file_location " "FROM genomes " + "LEFT OUTER JOIN users ON genomes.owner_id = users.id " + "LEFT JOIN genome_sources gs ON gs.id = genomes.genome_source_id " + "WHERE genomes.id in %s", (tuple(db_genome_ids), )) dict_genomes_user = {} for (prefix, public_id, owner_id, username, owned_by_root, fasta_path) in self.cur: if not self.currentUser.isRootUser(): if (owned_by_root or owner_id != self.currentUser.getUserId()): print( "WARNING: Insufficient permissions to edit genome {0}" .format(public_id)) print logging.warn( "{0} is trying to delete genome {1} owned by {2}". format(self.currentUser.getUsername(), public_id, username)) return (False, None, None) dict_genomes_user[public_id] = { "owner": username, "prefix": prefix, "relative_path": fasta_path } if self.currentUser.isRootUser(): current_username = self.currentUser.getElevatedFromUsername() else: current_username = self.currentUser.getUsername() except GenomeDatabaseError as e: raise e return (True, current_username, dict_genomes_user)
def addUser(self, username, firstname, lastname, rolename=None, has_root=False):
    """Create a new user account.

    Parameters
    ----------
    username : str
        Login name of the new user; must not exist yet.
    firstname : str
        First name of the new user.
    lastname : str
        Last name of the new user.
    rolename : str
        Name of the role to assign; defaults to 'user'.
    has_root : bool
        Whether the new user may log in as root.

    Returns
    -------
    bool
        True if the user was created.

    Raises
    ------
    GenomeDatabaseError
        If the current user is not allowed to create the account, the
        username already exists, or the role name is unknown.
    """
    if rolename is None:
        rolename = 'user'

    if not self.currentUser.isRootUser():
        if has_root:
            raise GenomeDatabaseError(
                "Only the root user may grant root access to new users."
            )

        if rolename == 'admin':
            raise GenomeDatabaseError(
                "Only the root user may create admin accounts.")

        if not (self.currentUser.getRolename() == 'admin' and
                rolename == 'user'):
            raise GenomeDatabaseError(
                "Only admins (and root) can create user accounts.")

    self.cur.execute("SELECT username from users where username = %s",
                     (username, ))
    if len(self.cur.fetchall()) > 0:
        raise GenomeDatabaseError(
            "User %s already exists in the database." % username)

    self.cur.execute(
        "INSERT into users (username,firstname,lastname, role_id, has_root_login) ("
        + "SELECT %s,%s,%s, id, %s " + "FROM user_roles " +
        "WHERE name = %s)",
        (username, firstname, lastname, has_root, rolename))

    # The INSERT is fed by a SELECT on user_roles: an unknown role matches
    # no row, so nothing is inserted and no database error is raised.
    if self.cur.rowcount == 0:
        raise GenomeDatabaseError(
            "Unknown role '%s': user %s was not created." %
            (rolename, username))

    return True
def _identifyHeadersCheckM(self, checkm_fh): """Parse header information from CheckM file. Parameters ---------- checkm_fh : file Handle to CheckM file. """ required_headers = { "Bin Id": None, "Completeness": None, "Contamination": None, "Marker lineage": None, "# genomes": None, "# markers": None, "# marker sets": None, "Strain heterogeneity": None } # Check the CheckM headers are consistent split_headers = checkm_fh.readline().rstrip().split("\t") for pos in range(0, len(split_headers)): header = split_headers[pos] if header not in required_headers: continue if required_headers[header] is not None: raise GenomeDatabaseError( "Seen %s header twice in the CheckM file. Check that the CheckM file is correct: %s." % (header, checkm_fh.name)) required_headers[header] = pos for header, col in required_headers.items(): if (header is "Completeness" or header is "Contamination") and col is None: raise GenomeDatabaseError( "Unable to find %s header in the CheckM file. Check that the CheckM file is correct: %s." % (header, checkm_fh.name)) return required_headers
def permissionToModify(self, genome_list_id):
    """Tell whether the current user may modify a genome list.

    Parameters
    ----------
    genome_list_id : int
        Unique identifier of genome list in database.

    Returns
    -------
    bool
        True if has permission, else False.

    Raises
    ------
    GenomeDatabaseError
        If the list does not exist, or the root user tries to modify a
        list that is not root-owned.
    """
    self.cur.execute(
        "SELECT owner_id, owned_by_root FROM genome_lists WHERE id = %s ",
        (genome_list_id, ))

    record = self.cur.fetchone()
    if not record:
        raise GenomeDatabaseError("No genome list with id: %s" %
                                  str(genome_list_id))
    owner_id, owned_by_root = record

    if self.currentUser.isRootUser():
        if not owned_by_root:
            raise GenomeDatabaseError(
                "Root user editing of other users lists not yet implemented."
            )
        return True

    # Regular users may only modify lists they own themselves.
    if owned_by_root or owner_id != self.currentUser.getUserId():
        return False
    return True
def rootLogin(self, username):
    """Log the given user in as root.

    Parameters
    ----------
    username : str
        Name of the user requesting root access.

    Returns
    -------
    User
        The root user object, also stored as self.currentUser.

    Raises
    ------
    GenomeDatabaseError
        If the user does not exist or has no root login permission.
    """
    self.cur.execute(
        "SELECT id, has_root_login FROM users WHERE username = %s",
        [username])
    record = self.cur.fetchone()
    self.cur.close()

    if not record:
        raise GenomeDatabaseError("User %s not found." % username)

    (_userid, has_root_login) = record
    if not has_root_login:
        raise GenomeDatabaseError(
            "You do not have sufficient permissions to logon as the root user."
        )

    self.currentUser = User.createRootUser(username)
    return self.currentUser
def _processCheckM(self, checkm_file): """Parse information from CheckM file. Parameters ---------- checkm_file : str Name of file containing CheckM results. Returns ------- dict CheckM statistics for each genome. """ try: checkm_fh = open(checkm_file, "rb") except: raise GenomeDatabaseError("Cannot open CheckM file: " + checkm_file) required_headers = self._identifyHeadersCheckM(checkm_fh) # populate CheckM results dict checkm_results_dict = {} for line in checkm_fh: line = line.rstrip() splitline = line.split("\t") bin_id = splitline[required_headers["Bin Id"]] completeness = splitline[required_headers["Completeness"]] contamination = splitline[required_headers["Contamination"]] lineage = splitline[required_headers["Marker lineage"]] genome_count = splitline[required_headers["# genomes"]] marker_count = splitline[required_headers["# markers"]] set_count = splitline[required_headers["# marker sets"]] heterogeneity = splitline[required_headers["Strain heterogeneity"]] checkm_results_dict[bin_id] = { "completeness": completeness, "contamination": contamination, "lineage": lineage, "genome_count": genome_count, "marker_count": marker_count, "set_count": set_count, "heterogeneity": heterogeneity } checkm_fh.close() return checkm_results_dict
def createMetadata(self, metadatafile):
    """Create or update metadata columns in the database.

    A column that already exists in the requested table only has its
    comment updated; a column that exists in a different table is skipped
    with a warning; any other column is added and then commented.

    Parameters
    ----------
    metadatafile : str
        TSV file listing one new field per line, in the format
        new_field<TAB>description<TAB>type<TAB>table.

    Raises
    ------
    GenomeDatabaseError
        If a line is malformed or targets a table whose name does not
        start with "metadata_".
    """
    data_dict = {}
    with open(metadatafile, 'r') as metaf:
        for line_no, line in enumerate(metaf, 1):
            array_line = line.strip().split('\t')
            if array_line == ['']:
                continue
            if len(array_line) < 4:
                raise GenomeDatabaseError(
                    "Line %d of %s must have 4 tab-separated fields (field, description, type, table)."
                    % (line_no, metadatafile))
            if not array_line[3].startswith("metadata_"):
                raise GenomeDatabaseError(
                    "Only Metadata Tables can be modified")
            data_dict[array_line[0]] = {
                "table": array_line[3],
                "type": array_line[2],
                "desc": array_line[1]
            }

    query = "SELECT v.field,v.table from view_list_meta_columns as v"
    self.cur.execute(query)
    all_col_dict = dict(self.cur.fetchall())

    for key, value in data_dict.items():
        # NOTE(review): table, column and type names cannot be bound as
        # parameters and are still formatted into the statement; the input
        # file must come from a trusted source.
        # The description is bound as a parameter so that quotes in the
        # text cannot break or alter the statement.
        query_comment = "COMMENT ON COLUMN {0}.{1} IS %s".format(
            value['table'], key)

        if key in all_col_dict:
            if all_col_dict.get(key) == value['table']:
                self.cur.execute(query_comment, (value['desc'], ))
            else:
                logging.warning(
                    "Column {0} is already presents in the {1} table ."
                    .format(key, all_col_dict.get(key)))
        else:
            query_add_col = "ALTER TABLE {0} ADD COLUMN {1} {2}".format(
                value['table'], key, value['type'])
            self.cur.execute(query_add_col)
            self.cur.execute(query_comment, (value['desc'], ))

    # PSQL views are not refreshed automatically, so the existing view is
    # dropped and recreated with a new definition.
    self.cur.execute("SELECT refreshView()")
def editUser(self, username, rolename=None, has_root=None, firstname=None, lastname=None):
    """Edit an existing user account.

    Only the attributes passed with a value other than None are changed.

    Parameters
    ----------
    username : str
        Login name of the account to edit.
    rolename : str
        New role name, resolved through the user_roles table.
    has_root : bool
        New value for the root login permission.
    firstname : str
        New first name.
    lastname : str
        New last name.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the current user is not the root user.
    """
    if not self.currentUser.isRootUser():
        raise GenomeDatabaseError(
            "Only the root user may edit existing accounts.")

    conditional_queries = []
    params = []

    if rolename is not None:
        conditional_queries.append(
            " role_id = (SELECT id from user_roles where name = %s) ")
        params.append(rolename)

    if has_root is not None:
        conditional_queries.append(" has_root_login = %s ")
        params.append(has_root)

    if firstname is not None:
        conditional_queries.append(" firstname = %s ")
        params.append(firstname)

    if lastname is not None:
        conditional_queries.append(" lastname = %s ")
        params.append(lastname)

    # Nothing to update when every optional argument was left as None.
    if params:
        self.cur.execute(
            "UPDATE users " + "SET " + ','.join(conditional_queries) + " "
            "WHERE username = %s", params + [username])

    return True
def printGenomeDetails(self, genome_id_list):
    """Collect the database details of the given genomes.

    Parameters
    ----------
    genome_id_list : iterable
        Unique identifier of genomes in database.

    Returns
    -------
    tuple
        Column headers.
    list
        One tuple per genome: external id, name, description, owner
        ("root" for root-owned genomes) and the date the genome was added.

    Raises
    ------
    GenomeDatabaseError
        If no genome ids are given.
    """
    if not genome_id_list:
        raise GenomeDatabaseError(
            "Unable to print genomes. No genomes found.")

    columns = ("genomes.id, genomes.name, description, owned_by_root, username, "
               "external_id_prefix || '_' || id_at_source as external_id, date_added")
    query = "".join([
        "SELECT ", columns, " FROM genomes ",
        "LEFT OUTER JOIN users ON genomes.owner_id = users.id ",
        "JOIN genome_sources AS sources ON genome_source_id = sources.id ",
        "AND genomes.id in %s ",
        "ORDER BY genomes.id ASC",
    ])
    self.cur.execute(query, (tuple(genome_id_list), ))

    header = ("genome_id", "name", "description", "owner", "data_added")

    rows = []
    for record in self.cur:
        (_genome_id, name, description, owned_by_root, username,
         external_id, date_added) = record
        owner = "root" if owned_by_root else username
        rows.append((external_id, name, description, owner,
                     date_added.date()))

    return header, rows
def printGenomeListsDetails(self, genome_list_ids):
    """Collect summary details of the given genome lists.

    Parameters
    ----------
    genome_list_ids : iterable
        Unique identifier of genome lists in database.

    Returns
    -------
    tuple
        Column headers.
    list
        One tuple per list: list id, name, owner ("root" for root-owned
        lists) and the number of genomes in the list.

    Raises
    ------
    GenomeDatabaseError
        If no genome list ids are given.
    """
    if not genome_list_ids:
        raise GenomeDatabaseError(
            "Unable to print genome details: No genomes given.")

    query = "".join([
        "SELECT lists.id, lists.name, lists.owned_by_root, users.username, count(contents.list_id) ",
        "FROM genome_lists as lists ",
        "LEFT OUTER JOIN users ON lists.owner_id = users.id ",
        "JOIN genome_list_contents as contents ON contents.list_id = lists.id ",
        "WHERE lists.id in %s ",
        "GROUP by lists.id, users.username ",
        "ORDER by lists.display_order asc, lists.id",
    ])
    self.cur.execute(query, (tuple(genome_list_ids), ))

    header = ("list_id", "name", "owner", "genome_count")
    rows = [
        (list_id, name, "root" if owned_by_root else username, genome_count)
        for (list_id, name, owned_by_root, username, genome_count) in self.cur
    ]

    return header, rows
def printMarkerSetsDetails(self, marker_set_ids):
    """Collect summary details of the given marker sets.

    Parameters
    ----------
    marker_set_ids : iterable
        Unique identifier of marker sets in database.

    Returns
    -------
    tuple
        Column headers.
    list
        One tuple per set: set id, name, description, owner ("root" for
        root-owned sets) and the number of markers in the set.

    Raises
    ------
    GenomeDatabaseError
        If no marker set ids are given.
    """
    if not marker_set_ids:
        raise GenomeDatabaseError(
            "Unable to print marker set details: No marker sets given."
        )

    query = "".join([
        "SELECT sets.id, sets.name, sets.description, sets.owned_by_root, users.username, count(contents.set_id) ",
        "FROM marker_sets as sets ",
        "LEFT OUTER JOIN users ON sets.owner_id = users.id ",
        "JOIN marker_set_contents as contents ON contents.set_id = sets.id ",
        "WHERE sets.id in %s ",
        "GROUP by sets.id, users.username ",
        "ORDER by sets.id asc ",
    ])
    self.cur.execute(query, (tuple(marker_set_ids), ))

    header = ("set_id", "name", "description", "owner", "marker_count")
    rows = [
        (set_id, name, description,
         "root" if owned_by_root else username, marker_count)
        for (set_id, name, description, owned_by_root, username,
             marker_count) in self.cur
    ]

    return header, rows
def userLogin(self, username):
    """Log the given user in with their regular role.

    Parameters
    ----------
    username : str
        Name of the user logging in.

    Returns
    -------
    User
        The user object, also stored as self.currentUser.

    Raises
    ------
    GenomeDatabaseError
        If no user with the given name exists.
    """
    query = ("SELECT users.id, user_roles.id, user_roles.name "
             "FROM users, user_roles "
             "WHERE users.role_id = user_roles.id "
             "AND users.username = %s")
    self.cur.execute(query, (username, ))

    record = self.cur.fetchone()
    if not record:
        raise GenomeDatabaseError("User not found: %s" % username)

    (user_id, role_id, rolename) = record
    self.currentUser = User.createUser(user_id, username, rolename,
                                       role_id)
    return self.currentUser
def printMarkerDetails(self, marker_id_list):
    """Collect the details of the given marker genes.

    Parameters
    ----------
    marker_id_list : iterable
        Unique identifier of markers in database.

    Returns
    -------
    tuple
        Column headers.
    list
        One tuple per marker: external id, name, description and size.

    Raises
    ------
    GenomeDatabaseError
        If no marker ids are given.
    """
    if not marker_id_list:
        raise GenomeDatabaseError(
            "Unable to print markers. No markers found.")

    columns = ("markers.id, markers.name, description, "
               "external_id_prefix || '_' || id_in_database as external_id, size")
    query = "".join([
        "SELECT ", columns, " FROM markers ",
        "LEFT OUTER JOIN users ON markers.owner_id = users.id ",
        "JOIN marker_databases AS databases ON marker_database_id = databases.id ",
        "AND markers.id in %s ",
        "ORDER BY markers.id ASC",
    ])
    self.cur.execute(query, (tuple(marker_id_list), ))

    # The database id is selected for ordering but not reported.
    header = ("Marker ID", "Name", "Description", "Length (aa)")
    rows = [
        (external_id, name, description, size)
        for (_marker_id, name, description, external_id, size) in self.cur
    ]

    return header, rows
def editMarkerSet(self, marker_set_id, marker_ids=None, operation=None, name=None, description=None, private=None):
    """Edit an existing marker set in the database.

    Parameters
    ----------
    marker_set_id : int
        Identifier of marker set in database.
    marker_ids : list
        Database identifiers of the markers to add or remove.
    operation : str
        Operation to perform on the marker set ('add' or 'remove'); None
        leaves the set contents untouched.
    name : str
        New name of the marker set.
    description : str
        New description of the marker set.
    private : bool
        Denotes whether this set is public or private.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the user may not edit the set, no marker ids accompany an
        operation, or the operation is unknown.
    """
    edit_permission = self.permissionToModify(marker_set_id)
    if edit_permission is None:
        raise GenomeDatabaseError(
            "Unable to retrieve marker set id for editing. Offending set id: %s"
            % marker_set_id)
    elif edit_permission is False:
        raise GenomeDatabaseError(
            "Insufficient permissions to edit this marker set. Offending set id: %s"
            % marker_set_id)

    update_query = []
    params = []

    if name is not None:
        update_query.append("name = %s")
        params.append(name)

    if description is not None:
        update_query.append("description = %s")
        params.append(description)

    if private is not None:
        update_query.append("private = %s")
        params.append(private)

    if params:
        self.cur.execute(
            "UPDATE marker_sets SET " + ",".join(update_query) +
            " WHERE id = %s", params + [marker_set_id])

    if operation is None:
        return True

    # Also covers marker_ids=None, which would otherwise fail on len().
    if not marker_ids:
        raise GenomeDatabaseError(
            "No marker ids given to perform '%s' operation." % operation)

    # Stage the marker ids in a temp table for the set-based statements.
    temp_table_name = Tools.generateTempTableName()
    self.cur.execute("CREATE TEMP TABLE %s (id integer)" %
                     (temp_table_name, ))
    query = "INSERT INTO {0} (id) VALUES (%s)".format(temp_table_name)
    self.cur.executemany(query, [(x, ) for x in marker_ids])

    if operation == 'add':
        query = (
            "INSERT INTO marker_set_contents (set_id, marker_id) " +
            "SELECT %s, id FROM {0} " + "WHERE id NOT IN ( " +
            "SELECT marker_id " + "FROM marker_set_contents " +
            "WHERE set_id = %s)").format(temp_table_name)
        self.cur.execute(query, (marker_set_id, marker_set_id))
    elif operation == 'remove':
        query = ("DELETE FROM marker_set_contents " +
                 "WHERE set_id = %s " + "AND marker_id IN ( " +
                 "SELECT id " + "FROM {0})").format(temp_table_name)
        self.cur.execute(query, [marker_set_id])

        query_is_empty = (
            "SELECT count(msc.marker_id) from marker_sets as ms " +
            "LEFT JOIN marker_set_contents as msc on msc.set_id = ms.id " +
            "WHERE ms.id = %s " + "GROUP BY ms.id")
        self.cur.execute(query_is_empty, (marker_set_id, ))
        count = self.cur.fetchone()

        if count is not None and count[0] == 0:
            # We delete the set because it's empty
            self.cur.execute("DELETE FROM marker_sets WHERE id = %s ",
                             (marker_set_id, ))

        # Delete the aligned markers no longer associated with any
        # marker set.
        # NOTE(review): assumed to run for every 'remove', not only when
        # the set became empty -- confirm against the original layout.
        self.cur.execute(
            "SELECT DISTINCT marker_id FROM marker_set_contents")
        markers_in_use = set(
            marker_id for (marker_id, ) in self.cur.fetchall())
        mkrs_to_del = [x for x in marker_ids if x not in markers_in_use]

        # An empty tuple would render as the invalid SQL "marker_id in ()".
        if mkrs_to_del:
            self.cur.execute(
                "DELETE FROM aligned_markers where marker_id in %s",
                (tuple(mkrs_to_del), ))
    else:
        raise GenomeDatabaseError(
            "Unknown marker set edit operation: %s" % operation)

    return True
def filterGenomes(self, marker_ids, genome_ids, quality_threshold, quality_weight, comp_threshold, cont_threshold, min_perc_aa, min_rep_perc_aa, taxa_filter, guaranteed_taxa_filter, genomes_to_exclude, guaranteed_ids, rep_ids, directory, prefix):
    """Filter genomes based on provided criteria.

    Genomes are filtered, in order, on taxonomy, on estimated quality, on
    explicit exclusion and on the amount of aligned amino acids in the MSA.
    Every filtered genome is recorded in <prefix>_filtered_genomes.tsv and
    the retained genome ids are written to <prefix>_good_genomes.tsv, both
    inside `directory`.

    Parameters
    ----------
    marker_ids : iterable
        Database identifiers of the markers making up the alignment.
    genome_ids : set
        Database identifiers of the genomes to consider; left unmodified.
    quality_threshold, quality_weight, comp_threshold, cont_threshold
        Quality criteria passed to the genome quality filter.
    min_perc_aa : float
        Minimum percentage of the alignment length a genome must cover.
    min_rep_perc_aa : float
        Same threshold, applied to representative genomes.
    taxa_filter, guaranteed_taxa_filter
        Taxonomic filters; skipped when empty.
    genomes_to_exclude : set
        Genomes explicitly marked for exclusion.
    guaranteed_ids : iterable
        Genomes exempt from filtering.
    rep_ids : iterable
        Representative genomes.
    directory : str
        Output directory; created when missing.
    prefix : str
        Prefix of the output file names.

    Returns
    -------
    tuple
        (retained genome ids as a set, ordered marker ids, dict with the
        details of each marker).

    Raises
    ------
    GenomeDatabaseError
        If a genome is marked for both retention and exclusion.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)

    # get mapping from db genome IDs to external IDs
    genome_mngr = GenomeManager(self.cur, self.currentUser)
    external_ids = genome_mngr.genomeIdsToExternalGenomeIds(genome_ids)
    filter_genome_file = os.path.join(directory,
                                      prefix + '_filtered_genomes.tsv')

    # The context manager closes the report even when a step raises.
    with open(filter_genome_file, 'w') as fout_filtered:
        self.logger.info('Filtering initial set of %d genomes.' %
                         len(genome_ids))

        extra_guaranteed_ids = [
            x for x in guaranteed_ids if x not in genome_ids
        ]
        if len(extra_guaranteed_ids) > 0:
            self.logger.warning(
                'Identified {0} guaranteed genomes absent from specified input genomes (Those genomes will not appear in the final tree).'
                .format(len(extra_guaranteed_ids)))
        # Kept as a set: set operations are applied to it further down.
        guaranteed_ids = set(guaranteed_ids).intersection(genome_ids)
        self.logger.info(
            'Identified %d genomes to be excluded from filtering.' %
            len(guaranteed_ids))

        # for all markers, get the expected marker size
        self.cur.execute(
            "SELECT markers.id, markers.name, description, id_in_database, size, external_id_prefix "
            + "FROM markers, marker_databases " +
            "WHERE markers.id in %s "
            "AND markers.marker_database_id = marker_databases.id "
            "ORDER by external_id_prefix ASC, id_in_database ASC",
            (tuple(marker_ids), ))

        chosen_markers = dict()
        chosen_markers_order = []

        total_alignment_len = 0
        for marker_id, marker_name, marker_description, id_in_database, size, external_id_prefix in self.cur:
            chosen_markers[marker_id] = {
                'external_id_prefix': external_id_prefix,
                'name': marker_name,
                'description': marker_description,
                'id_in_database': id_in_database,
                'size': size
            }
            chosen_markers_order.append(marker_id)
            total_alignment_len += size

        # filter genomes based on taxonomy
        # Work on a copy so the caller's set is never modified in place.
        genomes_to_retain = set(genome_ids)
        if taxa_filter:
            new_genomes_to_retain = self._taxa_filter(
                taxa_filter, genomes_to_retain, guaranteed_ids,
                retain_guaranteed=True)
            for genome_id in genomes_to_retain - new_genomes_to_retain:
                rep_str = 'Representative' if genome_id in rep_ids else ''
                fout_filtered.write(
                    '%s\t%s\t%s\n' %
                    (external_ids[genome_id],
                     'Filtered on taxonomic affiliation.', rep_str))

            genomes_to_retain = new_genomes_to_retain

        if guaranteed_taxa_filter:
            new_genomes_to_retain = self._taxa_filter(
                guaranteed_taxa_filter, genomes_to_retain, guaranteed_ids,
                retain_guaranteed=False)
            for genome_id in genomes_to_retain - new_genomes_to_retain:
                rep_str = 'Representative' if genome_id in rep_ids else ''
                fout_filtered.write(
                    '%s\t%s\t%s\n' %
                    (external_ids[genome_id],
                     'Filtered on guaranteed taxonomic affiliation.',
                     rep_str))

            genomes_to_retain = new_genomes_to_retain

        # find genomes based on completeness, contamination, or genome
        # quality
        self.logger.info(
            'Filtering genomes with completeness <%.1f%%, contamination >%.1f%%, or quality <%.1f%% (weight = %.1f).'
            % (comp_threshold, cont_threshold, quality_threshold,
               quality_weight))
        filtered_genomes = self._filterOnGenomeQuality(
            genomes_to_retain, quality_threshold, quality_weight,
            comp_threshold, cont_threshold)

        # sanity check representatives are not of poor quality
        final_filtered_genomes = set()
        for genome_id, quality in filtered_genomes.items():
            if genome_id not in guaranteed_ids:
                if genome_id in rep_ids:
                    self.logger.warning(
                        'Retaining representative genome %s despite poor estimated quality (comp=%.1f%%, cont=%.1f%%).'
                        % (external_ids[genome_id], quality[0], quality[1]))
                else:
                    final_filtered_genomes.add(genome_id)
                    fout_filtered.write(
                        '%s\t%s\t%.2f\t%.2f\n' %
                        (external_ids[genome_id],
                         'Filtered on quality (completeness, contamination).',
                         quality[0], quality[1]))

        self.logger.info(
            'Filtered %d genomes based on completeness, contamination, and quality.'
            % len(final_filtered_genomes))

        genomes_to_retain -= final_filtered_genomes

        # filter genomes explicitly specified for exclusion
        if genomes_to_exclude:
            for genome_id in genomes_to_exclude:
                if genome_id in external_ids:
                    fout_filtered.write(
                        '%s\t%s\n' % (external_ids[genome_id],
                                      'Explicitly marked for exclusion.'))

            conflicting_genomes = guaranteed_ids.intersection(
                genomes_to_exclude)
            if conflicting_genomes:
                raise GenomeDatabaseError(
                    'Genomes marked for both retention and exclusion, e.g.: %s'
                    % conflicting_genomes.pop())

            new_genomes_to_retain = genomes_to_retain.difference(
                genomes_to_exclude)
            self.logger.info(
                'Filtered %d genomes explicitly indicated for exclusion.' %
                (len(genomes_to_retain) - len(new_genomes_to_retain)))
            genomes_to_retain = new_genomes_to_retain

        # filter genomes with insufficient number of amino acids in MSA
        self.logger.info(
            'Filtering genomes with insufficient amino acids in the MSA.')
        aligned_marker_query = (
            "SELECT sequence, multiple_hits,hit_number,unique_genes " +
            "FROM aligned_markers " + "WHERE genome_id = %s " +
            "AND sequence is NOT NULL " + "AND marker_id IN %s")
        marker_id_tuple = tuple(marker_ids)

        filter_on_aa = set()
        for genome_id in genomes_to_retain:
            self.cur.execute(aligned_marker_query,
                             (genome_id, marker_id_tuple))

            # Count aligned residues; multi-hit markers only count when
            # they resolve to a single unique gene.
            total_aa = 0
            for sequence, multiple_hits, hit_number, unique_genes in self.cur:
                if not multiple_hits or unique_genes == 1:
                    total_aa += len(sequence) - sequence.count('-')

            # should retain guaranteed genomes unless they have zero amino
            # acids in MSA
            if genome_id in guaranteed_ids:
                if total_aa != 0:
                    continue
                else:
                    self.logger.warning(
                        'Filtered guaranteed genome %s with zero amino acids in MSA.'
                        % external_ids[genome_id])

            perc_alignment = total_aa * 100.0 / total_alignment_len
            if perc_alignment < min_perc_aa:
                rep_str = ''
                if genome_id in rep_ids:
                    if perc_alignment < min_rep_perc_aa:
                        rep_str = 'Representative'
                        self.logger.warning(
                            'Filtered representative genome %s due to lack of aligned amino acids (%.1f%%).'
                            % (external_ids[genome_id], perc_alignment))
                    else:
                        self.logger.warning(
                            'Retaining representative genome %s despite small numbers of aligned amino acids (%.1f%%).'
                            % (external_ids[genome_id], perc_alignment))
                        continue

                filter_on_aa.add(genome_id)
                fout_filtered.write('%s\t%s\t%d\t%.1f\t%s\n' % (
                    external_ids[genome_id],
                    'Insufficient number of amino acids in MSA (total AA, % alignment length)',
                    total_aa, perc_alignment, rep_str))

    self.logger.info(
        'Filtered %d genomes with insufficient amino acids in the MSA.' %
        len(filter_on_aa))

    genomes_to_retain.difference_update(filter_on_aa)
    self.logger.info('Producing tree data for %d genomes.' %
                     len(genomes_to_retain))

    good_genomes_file = os.path.join(directory,
                                     prefix + '_good_genomes.tsv')
    with open(good_genomes_file, 'w') as good_genomes:
        for item in genomes_to_retain:
            good_genomes.write("{0}\n".format(item))

    return (genomes_to_retain, chosen_markers_order, chosen_markers)
def runTreeWeightedExceptions(self, path, comp, conta, qweight, qt):
    """Export NCBI genomes passing the exception filter to a CSV file.

    Two queries are run over NCBI genomes (``genome_source_id`` 2 or 3)
    that have an NCBI taxonomy string:

    1. The "already selected" genomes: those with a non-NULL
       ``gtdb_genome_representative``, or those without one that pass the
       supplied completeness / contamination / weighted quality
       thresholds. The order (fourth rank of the taxonomy string) of each
       of these genomes is recorded.
    2. The genomes, other than those returned by the first query, that
       pass the ``DefaultValues.EXCEPTION_FILTER_ONE_*`` thresholds.

    Genomes of the second set with an undefined order (``o__``) are all
    reported. For the others, a genome is only considered when
    ``_checkTaxonomyUniqueness`` accepts it against the orders of the first
    set, and a single genome is kept per order (the one with the highest
    ``completeness - 4 * contamination``).

    Parameters
    ----------
    path : str
        Path of the CSV file to write.
    comp : number
        Lower bound on CheckM completeness for the first query.
    conta : number
        Upper bound on CheckM contamination for the first query.
    qweight : number
        Weight applied to contamination in the first query's quality
        score (``completeness - qweight * contamination``).
    qt : number
        Lower bound on that quality score.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the current user is not the root user.
    """
    if not self.currentUser.isRootUser():
        raise GenomeDatabaseError("Only the root user can run this command")

    selected_query = (
        "SELECT id,mt.ncbi_taxonomy FROM genomes g "
        "LEFT JOIN metadata_genes mg USING (id) "
        "LEFT JOIN metadata_taxonomy mt USING (id) "
        "LEFT JOIN metadata_ncbi mn USING (id) "
        "WHERE g.genome_source_id IN (2,3) and (mt.gtdb_genome_representative is not NULL or "
        "(mt.gtdb_genome_representative is NULL and mg.checkm_completeness > %s and mg.checkm_contamination < %s "
        "and mg.checkm_completeness-%s*mg.checkm_contamination > %s)) and mt.ncbi_taxonomy is not NULL")
    selected_params = (comp, conta, qweight, qt)
    self.cur.execute(selected_query, selected_params)
    # Echo the fully bound statement; the query text is defined once so the
    # printed statement can never drift from the executed one.
    print(self.cur.mogrify(selected_query, selected_params))

    selected_rows = self.cur.fetchall()
    existing_ids = tuple(gid for (gid, _taxonomy) in selected_rows)
    order_list = [taxonomy.split(';')[3]
                  for (_gid, taxonomy) in selected_rows]

    exception_query = (
        "SELECT g.id,g.name,mg.checkm_completeness,mg.checkm_contamination,mt.ncbi_taxonomy,mnuc.genome_size,(mg.checkm_completeness-4*mg.checkm_contamination) as quality_threshold,mn.ncbi_organism_name "
        "FROM genomes g "
        "LEFT JOIN metadata_genes mg USING (id) "
        "LEFT JOIN metadata_ncbi mn USING (id) "
        "LEFT JOIN metadata_nucleotide mnuc USING (id) "
        "LEFT JOIN metadata_taxonomy mt USING (id) "
        "WHERE g.genome_source_id IN (2,3) and "
        "(mg.checkm_completeness > %s and mg.checkm_contamination < %s "
        "and mg.checkm_completeness-4*mg.checkm_contamination > %s) and mt.ncbi_taxonomy is not NULL")
    exception_params = [
        DefaultValues.EXCEPTION_FILTER_ONE_CHECKM_COMPLETENESS,
        DefaultValues.EXCEPTION_FILTER_ONE_CHECKM_CONTAMINATION,
        DefaultValues.EXCEPTION_FILTER_ONE_QUALITY_THRESHOLD
    ]
    # An empty tuple would be rendered as "NOT IN ()", which is invalid
    # SQL, so the exclusion clause is only added when there are ids.
    if existing_ids:
        exception_query += " and g.id not in %s"
        exception_params.append(existing_ids)
    self.cur.execute(exception_query, exception_params)

    dict_except_order = {}
    for (gid, name, completeness, contamination, ncbitax, size, qual,
         organism) in self.cur:
        order = ncbitax.split(';')[3]
        if order == 'o__':
            # No order assigned: keyed by genome id, so all are reported.
            key, label = gid, 'unknown order'
        elif not self._checkTaxonomyUniqueness(ncbitax, order_list):
            # presumably the order is already covered by the first query
            # -- TODO confirm against _checkTaxonomyUniqueness
            continue
        elif (self._checkTaxonomyUniqueness(ncbitax, dict_except_order)
              or dict_except_order.get(order).get('quality') < float(qual)):
            # First genome seen for this order, or better than the one
            # currently stored for it.
            key, label = order, 'First'
        else:
            continue

        dict_except_order[key] = {
            'quality': float(qual),
            'id': gid,
            'full_info': [
                name, completeness, contamination, ncbitax, size, qual,
                organism, label
            ]
        }

    # The context manager closes the file even if a write fails.
    with open(path, "w") as fh:
        fh.write(
            "Name,CheckM_Completeness,CheckM_Contamination,NCBI_Taxonomy,Genome_size,Quality_Threshold,Organism_name,Filter_passed\n"
        )
        for item in dict_except_order.values():
            fh.write(",".join(str(v) for v in item.get('full_info')) + "\n")

    return True
def runSanityCheck(self):
    """Run consistency checks on the database and print the anomalies.

    After the rRNA, type strain and MIMAG validators have run, the
    following are reported on standard output:

    * GTDB representatives that do not match any ``id_at_source`` in
      ``genomes``;
    * NCBI genomes (``genome_source_id`` 2 or 3) without a row in the
      ``metadata_genes`` / ``metadata_ncbi`` join, or with a missing
      ``protein_count`` or ``ncbi_submitter``;
    * genomes without a ``metadata_genes`` row, or with a missing
      ``checkm_completeness`` or ``protein_count``;
    * genomes without a ``metadata_nucleotide`` row, or with a missing
      ``gc_count``.

    A value is "missing" when it is NULL or an empty string; counts are
    also treated as missing when they are zero.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the current user is not the root user.
    """
    if not self.currentUser.isRootUser():
        raise GenomeDatabaseError("Only the root user can run this command")

    def is_missing(value, zero_is_missing=False):
        """Return True for NULL / empty values (and zero when requested)."""
        return (value is None or value == ''
                or (zero_is_missing and value == 0))

    # validate rRNA gene counts and lengths
    self.logger.info(
        'Validating 5S, 16S, and 23S count and gene length data.')
    self._validateSSU_LSU()

    # validate type strains
    self.logger.info('Validating type strain.')
    self._validateTypeStrains()

    # validate MIMAG assignments
    self.logger.info('Validating MIMAG assignments.')
    self._validateMIMAG()

    self.cur.execute("SELECT id FROM genomes where genome_source_id in (2,3)")
    ncbi_ids = [gid for (gid, ) in self.cur]

    self.cur.execute("SELECT id,id_at_source FROM genomes")
    raw_ids = [(gid, source_id) for (gid, source_id) in self.cur]
    # Built without zip(*...) so that an empty genomes table does not raise.
    all_ids = [gid for (gid, _source_id) in raw_ids]
    dict_all_ids = dict(raw_ids)
    # A set keeps the representative lookup below linear overall.
    # assumes _chompRecord returns a hashable value (e.g. str) -- TODO confirm
    all_source_ids = set(source_id for (_gid, source_id) in raw_ids)

    # check if the representatives are still in the database
    self.cur.execute(
        "SELECT distinct(gtdb_genome_representative) from metadata_taxonomy where gtdb_genome_representative is not NULL"
    )
    representatives = [self._chompRecord(record) for (record, ) in self.cur]
    for representative in representatives:
        if representative not in all_source_ids:
            print("REPRESENTATIVE {0} has been removed from the database".
                  format(representative))

    # NCBI-specific metadata. The join is driven by metadata_genes, which
    # is also where protein_count is read from.
    self.cur.execute(
        "SELECT id,protein_count,ncbi_submitter from metadata_genes LEFT JOIN metadata_ncbi using (id) WHERE id in (SELECT id from genomes where genome_source_id in (2,3));"
    )
    dict_meta_ncbi = {
        gid: {
            "count": count,
            "submitter": project
        }
        for (gid, count, project) in self.cur
    }
    for ncbi_genome in ncbi_ids:
        if ncbi_genome not in dict_meta_ncbi:
            print("{0} has no metadata in metadata_ncbi".format(
                dict_all_ids[ncbi_genome]))
            continue
        meta = dict_meta_ncbi[ncbi_genome]
        if is_missing(meta["count"], zero_is_missing=True):
            # The message names metadata_genes because that is the table
            # the value was selected from.
            print("{0} protein_count value in metadata_genes is {1}".format(
                dict_all_ids[ncbi_genome], meta["count"]))
        if is_missing(meta["submitter"]):
            print("{0} ncbi_submitter value in metadata_ncbi is {1}".format(
                dict_all_ids[ncbi_genome], meta["submitter"]))

    # gene metadata of every genome
    self.cur.execute(
        "SELECT id,checkm_completeness,protein_count from metadata_genes")
    dict_meta_genes = {
        gid: {
            "checkm": checkm,
            "protein_count": count
        }
        for (gid, checkm, count) in self.cur
    }
    for genome in all_ids:
        if genome not in dict_meta_genes:
            print("{0} has no metadata in metadata_genes".format(
                dict_all_ids[genome]))
            continue
        meta = dict_meta_genes[genome]
        if is_missing(meta["checkm"]):
            print("{0} checkm_completeness value in metadata_genes is {1}".
                  format(dict_all_ids[genome], meta["checkm"]))
        if is_missing(meta["protein_count"], zero_is_missing=True):
            print("{0} protein_count value in metadata_genes is {1}".format(
                dict_all_ids[genome], meta["protein_count"]))

    # nucleotide metadata of every genome
    self.cur.execute("SELECT id,gc_count from metadata_nucleotide")
    dict_meta_nuc = {gid: {"gc": gc} for (gid, gc) in self.cur}
    for genome in all_ids:
        if genome not in dict_meta_nuc:
            print("{0} has no metadata in metadata_nucleotide".format(
                dict_all_ids[genome]))
        elif is_missing(dict_meta_nuc[genome]["gc"], zero_is_missing=True):
            print("{0} gc_count value in metadata_nucleotide is {1}".format(
                dict_all_ids[genome], dict_meta_nuc[genome]["gc"]))

    return True
def deleteGenomes(self, batchfile=None, db_genome_ids=None, reason=None):
    """Delete genomes from the database and retire their files.

    The rows of the given genomes are removed from every genome-related
    table, any ``gtdb_genome_representative`` pointing at a deleted genome
    is reset to NULL, and the directory of each genome owned by the
    current user is moved to the matching deprecated directory. Genomes
    owned by someone else are only logged, as their files must be moved
    manually.

    Parameters
    ----------
    batchfile : str, optional
        Text file listing ids to delete. Not read by this method; the ids
        are expected in ``db_genome_ids``.
    db_genome_ids : list
        Database ids of the genomes to delete. ``False`` signals that the
        caller failed to resolve the ids.
    reason : str, optional
        Reason for the deletion, recorded in the log messages.

    Returns
    -------
    bool
        True if successful (including when there is nothing to delete).

    Raises
    ------
    GenomeDatabaseError
        If the ids or permissions cannot be retrieved, the user lacks
        permission, or the user declines the confirmation prompt.
    """
    self._loggerSetup()

    if db_genome_ids is False:
        raise GenomeDatabaseError(
            "Unable to delete genomes. Unable to retrieve genome ids.")

    # restrict deletion to genomes owned by user
    has_permission, username, genomes_owners = self._hasPermissionToEditGenomes(
        db_genome_ids)
    if has_permission is None:
        raise GenomeDatabaseError(
            "Unable to delete genomes. Unable to retrieve permissions for genomes."
        )
    if has_permission is False:
        raise GenomeDatabaseError(
            "Unable to delete genomes. Insufficient permissions.")

    if not db_genome_ids:
        return True

    if not confirm(
            "Are you sure you want to delete %i genomes (this action cannot be undone)"
            % len(db_genome_ids)):
        raise GenomeDatabaseError("User aborted database action.")

    # Dependent rows are removed first and the genomes rows last
    # (presumably to satisfy foreign keys -- verify against the schema).
    id_params = (tuple(db_genome_ids), )
    tables = (
        ("aligned_markers", "genome_id"),
        ("genome_list_contents", "genome_id"),
        ("metadata_genes", "id"),
        ("metadata_ncbi", "id"),
        ("metadata_nucleotide", "id"),
        ("metadata_taxonomy", "id"),
        ("metadata_rna", "id"),
        ("metadata_sequence", "id"),
        ("genomes", "id"),
    )
    for table, column in tables:
        self.cur.execute(
            "DELETE FROM {0} WHERE {1} IN %s".format(table, column),
            id_params)

    # An empty tuple would be rendered as "IN ()", which is invalid SQL.
    if genomes_owners:
        self.cur.execute(
            "UPDATE metadata_taxonomy set gtdb_genome_representative = NULL where "
            + "gtdb_genome_representative in %s",
            (tuple(genomes_owners), ))

    for genome, info in genomes_owners.items():
        owner = info.get("owner")
        if str(username) != str(owner):
            logging.info(
                "Genome {0} has been deleted by {1} for the following reason '{2}' WARNING: {1} is not the owner of this {0} (real owner {3} ) {0} needs to be moved manually to the deprecated folder"
                .format(genome, username, reason, owner))
            continue

        prefix = info.get("prefix")
        relative_path = info.get("relative_path")
        # Compare by value: identity ("is") on strings only works when the
        # interpreter happens to intern both operands.
        if prefix == "U":
            target = os.path.dirname(
                os.path.join(self.deprecatedUserDir, relative_path))
        elif prefix == "GB":
            target = os.path.join(self.deprecatedGBKDir, relative_path)
        elif prefix == "RS":
            target = os.path.join(self.deprecatedRSQDir, relative_path)
        else:
            # Never reuse the target of a previous genome: skip the move
            # and leave a trace so that it can be done by hand.
            logging.warning(
                "Genome {0} has been deleted by {1} but has an unknown prefix '{2}'; {0} needs to be moved manually to the deprecated folder"
                .format(genome, username, prefix))
            continue

        make_sure_path_exists(target)
        os.rename(
            os.path.dirname(
                Tools.fastaPathGenerator(relative_path, prefix)), target)
        logging.info(
            "Genome {0} has been deleted by {1} for the following reason '{2}'"
            .format(genome, username, reason))

    return True
def editGenomeList(self,
                   genome_list_id,
                   genome_ids=None,
                   operation=None,
                   name=None,
                   description=None,
                   private=None):
    """Edit an existing genome list in the database.

    The list attributes that are not None are updated first. If an
    operation is given, the genomes are then added to or removed from the
    list; a list left without any genome by a removal is deleted.

    Parameters
    ----------
    genome_list_id : int
        Identifier of genome list in database.
    genome_ids : list
        A list of genome ids to be modified. Required when ``operation``
        is given.
    operation : str
        Operation to perform on genome list ('add' or 'remove').
    name : str
        New name of the list.
    description : str
        New description of the list.
    private : bool
        Denotes whether this list is public or private.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the list cannot be retrieved or modified by the current user,
        if an operation is requested without genome ids, or if the
        operation is unknown.
    """
    edit_permission = self.permissionToModify(genome_list_id)
    if edit_permission is None:
        raise GenomeDatabaseError(
            "Unable to retrieve genome list id for editing. Offending list id: %s"
            % genome_list_id)
    if edit_permission is False:
        raise GenomeDatabaseError(
            "Insufficient permissions to edit this genome list. Offending list id: %s"
            % genome_list_id)

    assignments = []
    params = []
    for column, value in (("name", name), ("description", description),
                          ("private", private)):
        if value is not None:
            assignments.append(column + " = %s")
            params.append(value)
    if params:
        self.cur.execute(
            "UPDATE genome_lists SET " + ",".join(assignments) +
            " WHERE id = %s", params + [genome_list_id])

    if operation is None:
        return True

    # "not genome_ids" also covers the default of None, for which len()
    # would raise a TypeError.
    if not genome_ids:
        raise GenomeDatabaseError(
            "No genome ids given to perform '%s' operation." % operation)
    # Reject unknown operations before any temporary table is created.
    if operation not in ('add', 'remove'):
        raise GenomeDatabaseError(
            "Unknown genome set edit operation: %s" % operation)

    temp_table_name = generateTempTableName()
    self.cur.execute("CREATE TEMP TABLE %s (id integer)" %
                     (temp_table_name, ))
    self.cur.executemany(
        "INSERT INTO {0} (id) VALUES (%s)".format(temp_table_name),
        [(x, ) for x in genome_ids])

    if operation == 'add':
        # Only insert genomes that are not already part of the list.
        query = ("INSERT INTO genome_list_contents (list_id, genome_id) " +
                 "SELECT %s, id FROM {0} " + "WHERE id NOT IN ( " +
                 "SELECT genome_id " + "FROM genome_list_contents " +
                 "WHERE list_id = %s)").format(temp_table_name)
        self.cur.execute(query, (genome_list_id, genome_list_id))
        return True

    query = ("DELETE FROM genome_list_contents " + "WHERE list_id = %s " +
             "AND genome_id IN ( " + "SELECT id " +
             "FROM {0})").format(temp_table_name)
    self.cur.execute(query, [genome_list_id])

    # The list id is passed as a bound parameter, as in the UPDATE above,
    # instead of being formatted into the statement.
    self.cur.execute(
        "SELECT count(glc.genome_id) from genome_lists as gl " +
        "LEFT JOIN genome_list_contents as glc on glc.list_id = gl.id " +
        "WHERE gl.id = %s " + "GROUP BY gl.id", [genome_list_id])
    count = self.cur.fetchone()
    if count is not None and count[0] == 0:
        # We delete the list because it's empty
        self.cur.execute("DELETE FROM genome_lists WHERE id = %s ",
                         [genome_list_id])

    return True
def _storeMetadata(self, db_genome_id, genome_dir):
    """Parse metadata files for genome and store in database.

    The nucleotide and gene metadata files are two-column, tab-separated
    files of (column name, value) pairs. The Greengenes SSU, SILVA SSU and
    SILVA LSU results are read through ``_parse_taxonomy_file`` and
    ``_parse_sequence_file``. Every pair is written with one UPDATE on the
    row of ``db_genome_id``; values that parse as numbers are stored as
    floats, except for columns whose name contains ``blast_subject_id``
    in the rRNA results.

    Parameters
    ----------
    db_genome_id : str
        Unique database identifier of genome.
    genome_dir : str
        Directory containing metadata files to parse.

    Returns
    -------
    bool
        True if successful.

    Raises
    ------
    GenomeDatabaseError
        If the database reports an error.
    """

    def read_pairs(path):
        """Return the tab-separated fields of each non-blank line of path."""
        # The context manager closes the file; blank lines are skipped as
        # they cannot be unpacked into a (column, value) pair.
        with open(path) as handle:
            return [
                tuple(line.rstrip().split('\t')) for line in handle
                if line.strip()
            ]

    def store_pairs(query, pairs, text_marker=None):
        """Execute query once per (column, value) pair.

        Values are converted to float when possible, unless text_marker is
        given and found in the column name.
        """
        for column, value in pairs:
            if text_marker is None or text_marker not in column:
                # Only the conversion is guarded: a database error must
                # surface as is rather than trigger a second, identical
                # execute that would mask it.
                try:
                    value = float(value)
                except (TypeError, ValueError):
                    pass  # not numeric: store the raw value
            # Column names cannot be bound as parameters, hence AsIs.
            self.cur.execute(query, [AsIs(column), value])

    try:
        # nucleotide metadata
        query_nt = "UPDATE metadata_nucleotide SET %s = %s WHERE id = {0}".format(
            db_genome_id)
        store_pairs(
            query_nt,
            read_pairs(
                os.path.join(genome_dir, ConfigMetadata.GTDB_NT_FILE)))

        # protein metadata
        query_gene = "UPDATE metadata_genes SET %s = %s WHERE id = {0}".format(
            db_genome_id)
        store_pairs(
            query_gene,
            read_pairs(
                os.path.join(genome_dir, ConfigMetadata.GTDB_GENE_FILE)))

        # rRNA results all go to metadata_rna (taxonomy) and
        # metadata_rrna_sequences (sequences)
        query_taxonomy = "UPDATE metadata_rna SET %s = %s WHERE id = {0}".format(
            db_genome_id)
        query_sequence = "UPDATE metadata_rrna_sequences SET %s = %s WHERE id = {0}".format(
            db_genome_id)

        # Greengenes SSU metadata (only the taxonomy is stored)
        metadata_ssu_gg_path = os.path.join(
            genome_dir, ConfigMetadata.GTDB_SSU_GG_OUTPUT_DIR,
            ConfigMetadata.GTDB_SSU_FILE)
        gg_taxonomy, _gg_count, _gg_query_id = self._parse_taxonomy_file(
            metadata_ssu_gg_path, ConfigMetadata.GTDB_SSU_GG_PREFIX)
        if gg_taxonomy:
            store_pairs(query_taxonomy, gg_taxonomy,
                        text_marker="blast_subject_id")

        # SILVA SSU metadata
        ssu_silva_dir = os.path.join(
            genome_dir, ConfigMetadata.GTDB_SSU_SILVA_OUTPUT_DIR)
        ssu_taxonomy, ssu_count, ssu_query_id = self._parse_taxonomy_file(
            os.path.join(ssu_silva_dir, ConfigMetadata.GTDB_SSU_FILE),
            ConfigMetadata.GTDB_SSU_SILVA_PREFIX,
            os.path.join(ssu_silva_dir,
                         ConfigMetadata.GTDB_SSU_SILVA_SUMMARY_FILE))
        if ssu_taxonomy:
            store_pairs(query_taxonomy, ssu_taxonomy,
                        text_marker="blast_subject_id")
        if ssu_query_id is not None:
            ssu_sequences = self._parse_sequence_file(
                os.path.join(ssu_silva_dir,
                             ConfigMetadata.GTDB_SSU_FNA_FILE),
                ConfigMetadata.GTDB_SSU_SILVA_PREFIX, ssu_query_id)
            for column, value in ssu_sequences:
                self.cur.execute(query_sequence, [AsIs(column), value])

        # SILVA LSU metadata
        lsu_silva_dir = os.path.join(
            genome_dir, ConfigMetadata.GTDB_LSU_SILVA_OUTPUT_DIR)
        lsu_taxonomy, lsu_count, lsu_query_id = self._parse_taxonomy_file(
            os.path.join(lsu_silva_dir, ConfigMetadata.GTDB_LSU_FILE),
            ConfigMetadata.GTDB_LSU_SILVA_PREFIX,
            os.path.join(lsu_silva_dir,
                         ConfigMetadata.GTDB_LSU_SILVA_SUMMARY_FILE))
        if lsu_taxonomy:
            store_pairs(query_taxonomy, lsu_taxonomy,
                        text_marker="blast_subject_id")
        if lsu_query_id is not None:
            lsu_sequences = self._parse_sequence_file(
                os.path.join(lsu_silva_dir,
                             ConfigMetadata.GTDB_LSU_FNA_FILE),
                ConfigMetadata.GTDB_LSU_SILVA_PREFIX, lsu_query_id)
            for column, value in lsu_sequences:
                self.cur.execute(query_sequence, [AsIs(column), value])

        # rRNA gene counts reported by the SILVA runs
        query_gene_ssu = "UPDATE metadata_genes SET ssu_count = %s WHERE id = {0}".format(
            db_genome_id)
        self.cur.execute(query_gene_ssu, (ssu_count, ))

        query_gene_lsu = "UPDATE metadata_genes SET lsu_23s_count = %s WHERE id = {0}".format(
            db_genome_id)
        self.cur.execute(query_gene_lsu, (lsu_count, ))

        return True
    except psycopg2.Error as e:
        print("error")
        raise GenomeDatabaseError(e.pgerror)
    except Exception:
        # Formatted explicitly so that the message is a sentence rather
        # than the repr of a tuple under Python 2.
        print("Unexpected error: {0}".format(sys.exc_info()[0]))
        raise