Пример #1
0
    def deleteGenomeList(self, genome_list_ids):
        """Delete genome lists and the genomes associated with them.

        Edit permission for every requested list is checked first and the
        user is then asked for confirmation exactly once, so a permission
        failure or a declined prompt is detected before any list is touched.

        Parameters
        ----------
        genome_list_ids : iterable
            Unique identifiers of genome lists in database.

        Returns
        -------
        bool
            True if successful.

        Raises
        ------
        GenomeDatabaseError
            If the current user may not modify one of the lists, or if the
            user declines the confirmation prompt.
        """

        # Materialise the argument: it is counted for the prompt and
        # traversed twice, which a one-shot iterable would not survive.
        genome_list_ids = list(genome_list_ids)
        if not genome_list_ids:
            # Nothing to delete, hence nothing to confirm.
            return True

        for genome_list_id in genome_list_ids:
            if self.permissionToModify(genome_list_id) is False:
                raise GenomeDatabaseError(
                    "Insufficient permissions to delete genome list. Offending list id: {0}"
                    .format(genome_list_id))

        # Ask once for the whole batch; the prompt reports the batch size.
        if not confirm(
                "Are you sure you want to delete {0} lists (this action cannot be undone)"
                .format(len(genome_list_ids))):
            raise GenomeDatabaseError("User aborted database action.")

        for genome_list_id in genome_list_ids:
            list_genomes_ids = self.getGenomeIdsFromGenomeListIds(
                [genome_list_id])
            # Every genome currently in the list is handed to the 'remove'
            # operation of editGenomeList.
            self.editGenomeList(genome_list_id, list_genomes_ids, 'remove')

        return True
Пример #2
0
    def deleteMarkerSets(self, marker_set_ids):
        """Delete marker sets and the markers associated with them.

        Edit permission for every requested set is checked first and the
        user is then asked for confirmation exactly once, so a permission
        failure or a declined prompt is detected before any set is touched.

        Parameters
        ----------
        marker_set_ids : iterable
            Unique identifiers of marker sets in database.

        Returns
        -------
        bool
            True if successful.

        Raises
        ------
        GenomeDatabaseError
            If the current user may not modify one of the sets, or if the
            user declines the confirmation prompt.
        """

        # Materialise the argument: it is counted for the prompt and
        # traversed twice, which a one-shot iterable would not survive.
        marker_set_ids = list(marker_set_ids)
        if not marker_set_ids:
            # Nothing to delete, hence nothing to confirm.
            return True

        for marker_set_id in marker_set_ids:
            if self.permissionToModify(marker_set_id) is False:
                raise GenomeDatabaseError(
                    "Insufficient permissions to delete marker set. Offending marker set id: {0}"
                    .format(marker_set_id))

        # Ask once for the whole batch; the prompt reports the batch size.
        if not confirm(
                "Are you sure you want to delete {0} set(s) (this action cannot be undone)"
                .format(len(marker_set_ids))):
            raise GenomeDatabaseError("User aborted database action.")

        for marker_set_id in marker_set_ids:
            list_marker_ids = self.getMarkerIdsFromMarkerSetIds(
                [marker_set_id])
            # Every marker currently in the set is handed to the 'remove'
            # operation of editMarkerSet.
            self.editMarkerSet(marker_set_id, list_marker_ids, 'remove')

        return True
Пример #3
0
    def getGenomeIdsFromGenomeListIds(self, genome_list_ids):
        """Return the ids of all genomes contained in the given genome lists.

        The requested list ids are loaded into a temporary table so that ids
        missing from genome_lists can be reported before the contents are
        queried.

        Parameters
        ----------
        genome_list_ids : iterable
            Identifiers of the genome lists whose contents are retrieved.

        Returns
        -------
        list
            Genome ids found in genome_list_contents for the given lists.

        Raises
        ------
        GenomeDatabaseError
            If no list ids are given, if the ids cannot be loaded into the
            temporary table, or if an id does not exist in genome_lists.
        """
        # Materialise first so that one-shot iterables can be reused for the
        # insert, the error message and the final query.
        if genome_list_ids is not None:
            genome_list_ids = list(genome_list_ids)

        if not genome_list_ids:
            raise GenomeDatabaseError(
                "No genome lists given. Cannot retrieve IDs")

        temp_table_name = generateTempTableName()

        try:
            self.cur.execute("CREATE TEMP TABLE %s (id integer)" %
                             (temp_table_name, ))
            query = "INSERT INTO {0} (id) VALUES (%s)".format(temp_table_name)
            self.cur.executemany(
                query,
                [(genome_list_id, ) for genome_list_id in genome_list_ids])
        except Exception:
            # No single offending id is known at this point (table creation
            # itself may have failed), so the whole batch is reported.
            raise GenomeDatabaseError(
                "Unable to process genome list IDs: %s" %
                str(genome_list_ids))

        # Find any ids that don't have genome lists
        query = ("SELECT id FROM {0} " + "WHERE id NOT IN ( " +
                 "SELECT id " +
                 "FROM genome_lists)").format(temp_table_name)
        self.cur.execute(query)

        missing_list_ids = [list_id for (list_id, ) in self.cur]
        if missing_list_ids:
            raise GenomeDatabaseError(
                "Unknown genome list id(s) given. %s" %
                str(missing_list_ids))

        # NOTE(review): no view-permission filtering is performed here; every
        # genome of the requested lists is returned.
        self.cur.execute(
            "SELECT genome_id " + "FROM genome_list_contents " +
            "WHERE list_id in %s", (tuple(genome_list_ids), ))

        return [genome_id for (genome_id, ) in self.cur.fetchall()]
Пример #4
0
    def moveGenomes(self, db_genome_ids):
        """Move genome files into database directory structure.

        This function assumes addGenomes() has been called. It is
        not directly called by addGenomes() as all database
        queries are performed before moving genomes.

        Only genomes flagged as user editable are moved; their file
        locations and gene file checksum are then updated in the genomes
        table. The temporary output directory is removed at the end, also
        when there was nothing to move.

        Parameters
        ----------
        db_genome_ids : list
            Unique database identifiers for genomes.

        Raises
        ------
        GenomeDatabaseError
            If no temporary output directory is set, if the user to file the
            genomes under cannot be determined, or if a target genome
            directory already exists.
        """

        # Explicit check instead of an assert, which is stripped under -O.
        if not self.tmp_output_dir:
            raise GenomeDatabaseError(
                "No temporary output directory available to move genomes from.")

        # get database genome identifiers
        external_id_dict = {}
        if db_genome_ids:
            # Guarded because an empty tuple renders as an invalid "IN ()".
            self.cur.execute("SELECT genomes.id,user_editable, external_id_prefix || '_' || id_at_source as external_id " +
                             "FROM genomes, genome_sources " +
                             "WHERE genome_source_id = genome_sources.id " +
                             "AND genomes.id in %s", (tuple(db_genome_ids),))

            for (genome_id, user_editable, external_id) in self.cur:
                if user_editable:
                    external_id_dict[genome_id] = external_id

        # The username is only needed (and only resolved) when there is at
        # least one genome to move.
        if external_id_dict:
            if self.currentUser.isRootUser():
                username = self.currentUser.getElevatedFromUsername()
            else:
                username = self.currentUser.getUsername()

            if username is None:
                raise GenomeDatabaseError(
                    "Unable to determine user to add genomes under.")

            gtdb_target_dir = os.path.join(self.genomeCopyDir, username)
            for db_genome_id, external_id in external_id_dict.items():
                tmp_genome_dir = os.path.join(self.tmp_output_dir, external_id)

                genome_target_dir = os.path.join(gtdb_target_dir, external_id)
                if os.path.exists(genome_target_dir):
                    raise GenomeDatabaseError(
                        "Genome directory already exists: %s" % genome_target_dir)

                shutil.move(tmp_genome_dir, genome_target_dir)

                # Stored locations are relative (username/external_id/...),
                # while the checksum is computed on the moved file itself.
                self.cur.execute("UPDATE genomes SET fasta_file_location = %s , genes_file_location = %s , genes_file_sha256 = %s WHERE id = %s", (
                    os.path.join(
                        username, external_id, external_id + self.genomeFileSuffix),
                    os.path.join(
                        username, external_id, self.userAnnotationDir, external_id + self.proteinFileSuffix),
                    sha256(os.path.join(genome_target_dir, self.userAnnotationDir, external_id + self.proteinFileSuffix)),
                    db_genome_id))

        shutil.rmtree(self.tmp_output_dir)
Пример #5
0
    def addGenomeList(self,
                      genome_id_list,
                      name,
                      description,
                      owner_id=None,
                      private=None):
        """Create a new genome list and fill it with the given genomes.

        Parameters
        ----------
        genome_id_list : list
            Genome ids to place in the new list.
        name : str
            Name of the new list.
        description : str
            Description of the new list.
        owner_id : str
            Id of the user owning the list; None creates a root owned list.
        private : bool
            Whether the list is private.

        Returns
        -------
        int
            Database identifier of the newly created genome list.

        Raises
        ------
        GenomeDatabaseError
            If a non-root user tries to create a root owned list or a list
            owned by somebody else.
        """

        # Only non-root users are restricted in whom they may create for.
        if not self.currentUser.isRootUser():
            if owner_id is None:
                raise GenomeDatabaseError(
                    "Only the root user can create root owned lists.")
            if self.currentUser.getUserId() != owner_id:
                raise GenomeDatabaseError(
                    "Only the root user may create lists on behalf of other people."
                )

        owned_by_root = owner_id is None
        self.cur.execute(
            "INSERT INTO genome_lists (name, description, owned_by_root, owner_id, private) VALUES (%s, %s, %s, %s, %s) RETURNING id",
            (name, description, owned_by_root, owner_id, private))
        new_list_id = self.cur.fetchone()[0]

        content_rows = [(new_list_id, genome_id) for genome_id in genome_id_list]
        self.cur.executemany(
            "INSERT INTO genome_list_contents (list_id, genome_id) VALUES (%s, %s)",
            content_rows)

        return new_list_id
Пример #6
0
    def _addGenomesWorker(self, genomic_files, file_paths, checkm_results_dict,
                          study_id, out_q, progress_queue):
        """Attach study id and metadata to each genome (worker function).

        Parameters
        ----------
        genomic_files : dict
            Maps database genome id to a dict that holds at least the key
            'checkm_bin_id'.
        file_paths : dict
            Maps database genome id to a dict with the keys 'fasta_path' and
            'gff_path'.
        checkm_results_dict : dict
            CheckM results keyed by bin id.
        study_id
            Study identifier written to every processed genome.
        out_q
            Queue that receives the string "True" once all genomes are done.
        progress_queue
            Queue that receives the bin id of every processed genome.

        Returns
        -------
        bool
            True once every genome has been processed.

        Raises
        ------
        GenomeDatabaseError
            If a genome's bin id has no entry in checkm_results_dict.
        """
        metadata_mngr = MetadataManager(self.cur, self.currentUser)

        for db_genome_id, genome_info in genomic_files.iteritems():
            self.cur.execute("UPDATE genomes SET study_id = %s WHERE id = %s",
                             (study_id, db_genome_id))

            paths = file_paths[db_genome_id]
            # Metadata output goes next to the genome's FASTA file.
            output_dir = os.path.dirname(paths["fasta_path"])

            bin_id = genome_info['checkm_bin_id']
            if bin_id not in checkm_results_dict:
                raise GenomeDatabaseError(
                    "Couldn't find CheckM result for bin %s." % bin_id)

            metadata_mngr.addMetadata(db_genome_id,
                                      paths["fasta_path"],
                                      paths["gff_path"],
                                      checkm_results_dict[bin_id],
                                      output_dir)

            progress_queue.put(bin_id)

        out_q.put("True")
        return True
Пример #7
0
    def ExportGenomePaths(self, path):
        """Export the directory of every genome to a tab-separated file.

        Each output line holds the external genome id (source prefix,
        underscore, id at source) and the directory that contains the
        genome's FASTA file.

        Parameters
        ----------
        path : str
            Path to the output file.

        Raises
        ------
        GenomeDatabaseError
            If a genome has a source prefix other than 'U', 'RS' or 'GB'.
        """

        # Root directory for each known genome source prefix.
        dir_prefixes = {
            'U': Config.GTDB_GENOME_USR_DIR,
            'RS': Config.GTDB_GENOME_RSQ_DIR,
            'GB': Config.GTDB_GENOME_GBK_DIR,
        }

        query_tmp = "SELECT id_at_source,external_id_prefix,fasta_file_location FROM genomes,genome_sources WHERE genomes.genome_source_id=genome_sources.id;"
        self.cur.execute(query_tmp)
        with open(path, "w") as f:
            # id_at_source rather than 'id' to avoid shadowing the builtin.
            for (id_at_source, prefix, file_location) in self.cur:
                if prefix not in dir_prefixes:
                    raise GenomeDatabaseError(
                        "Unrecognized database prefix: %s" % prefix)

                genome_dir = os.path.dirname(
                    os.path.join(dir_prefixes[prefix], file_location))
                f.write("{0}\t{1}\n".format(prefix + "_" + id_at_source,
                                            genome_dir))

        # Function-call form prints the same text on Python 2 and Python 3.
        print("Export Successful")
Пример #8
0
    def getMarkerIdsFromMarkerSetIds(self, marker_set_ids):
        """Get marker identifiers within specific marker sets.

        Parameters
        ----------
        marker_set_ids : iterable
            Identifiers of marker sets.

        Returns
        -------
        list
            Identifiers of markers in the marker sets.

        Raises
        ------
        GenomeDatabaseError
            If no marker set is given or if fewer marker sets are found in
            the database than distinct identifiers were requested.
        """

        # Materialise once; an empty tuple would render as an invalid "IN ()".
        requested_ids = tuple(marker_set_ids) if marker_set_ids is not None else ()
        if not requested_ids:
            raise GenomeDatabaseError(
                "No marker sets given. Cannot retrieve marker IDs.")

        self.cur.execute(
            "SELECT id " +
            "FROM marker_sets " + "WHERE id in %s ", (requested_ids, ))

        # Compare counts rather than values so that ids passed as strings
        # still match the integers returned by the database.
        found_count = len(self.cur.fetchall())
        if found_count < len(set(requested_ids)):
            raise GenomeDatabaseError(
                "At least one marker set is invalid: %s" %
                str(list(requested_ids)))

        self.cur.execute(
            "SELECT marker_id " + "FROM marker_set_contents " +
            "WHERE set_id in %s ", (requested_ids, ))

        return [marker_id for (marker_id, ) in self.cur.fetchall()]
Пример #9
0
    def importMetadata(self,
                       table=None,
                       field=None,
                       typemeta=None,
                       metafile=None):
        """Import one metadata field for a list of genomes.

        The file is read line by line; the part of the genome id after the
        first underscore and the value are collected and handed to the
        database function upsert() in a single call.

        Parameters
        ----------
        table : str
            Table where the column is located.
        field : str
            Name of the column.
        typemeta : str
            Data type of the column.
        metafile : str
            TSV file with the format (Genome_id \\t Value).

        Raises
        ------
        GenomeDatabaseError
            If a line is malformed, if the file holds no records, or if the
            database reports an error.
        """
        genome_ids = []
        meta_values = []
        with open(metafile, 'r') as metaf:
            for line_num, line in enumerate(metaf, 1):
                columns = line.strip().split('\t')
                if columns == ['']:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue

                if len(columns) < 2:
                    raise GenomeDatabaseError(
                        "Malformed line %d in %s: expected a genome id and a value separated by a tab."
                        % (line_num, metafile))

                # Drop the source prefix: everything up to the first '_'.
                _prefix, separator, id_at_source = columns[0].partition("_")
                if not separator:
                    raise GenomeDatabaseError(
                        "Malformed genome id on line %d in %s: %s" %
                        (line_num, metafile, columns[0]))

                genome_ids.append(id_at_source)
                meta_values.append(columns[1])

        if not genome_ids:
            raise GenomeDatabaseError(
                "No metadata records found in %s" % metafile)

        try:
            # All five arguments are bound as parameters so that quoting is
            # left to the database driver.
            self.cur.execute("SELECT upsert(%s,%s,%s,%s,%s)",
                             (table, field, typemeta, genome_ids,
                              meta_values))
        except psycopg2.Error as e:
            raise GenomeDatabaseError(e.pgerror)
Пример #10
0
    def printGenomeStats(self, genome_id_list, stat_fields):
        """Collect selected statistics of genomes for printing.

        Parameters
        ----------
        genome_id_list : iterable
            Unique identifiers of genomes in database.
        stat_fields : list
            Names of the metadata_view columns to report in addition to the
            accession.

        Returns
        -------
        list
            Column headers ('accession' followed by stat_fields).
        list
            Content for each row, without the leading database id.

        Raises
        ------
        GenomeDatabaseError
            If genome_id_list is empty.
        """

        if not genome_id_list:
            raise GenomeDatabaseError(
                "Unable to print genomes. No genomes found.")

        # 'id' is selected but dropped again from both header and rows.
        selected_columns = ['id', 'accession'] + stat_fields

        self.cur.execute(
            "SELECT " + ','.join(selected_columns) + " FROM metadata_view " +
            "WHERE id in %s", (tuple(genome_id_list), ))

        rows = [record[1:] for record in self.cur]

        return selected_columns[1:], rows
Пример #11
0
    def _hasPermissionToEditGenomes(self, db_genome_ids):
        '''
        Function _hasPermissionToEditGenomes
        Check if a user is entitled to delete genomes.
        Users can delete their own genomes, Admin can delete any genomes

        :param db_genome_ids:list of genomes is to delete

        Return a tuple containing:
        - Boolean returning the state of the function
        - The username currently running the delete function
        -a Dictionary listing the list of genomes and where each genome has saved
            the owner,the Prefix (U,GB or RS) , the relative path

        '''
        try:
            if not db_genome_ids:
                raise GenomeDatabaseError(
                    "Unable to retrieve genome permissions, no genomes given: %s"
                    % str(db_genome_ids))

            self.cur.execute(
                "SELECT gs.external_id_prefix,gs.external_id_prefix || '_'|| genomes.id_at_source, owner_id, username, owned_by_root,fasta_file_location "
                "FROM genomes " +
                "LEFT OUTER JOIN users ON genomes.owner_id = users.id " +
                "LEFT JOIN genome_sources gs ON gs.id = genomes.genome_source_id "
                + "WHERE genomes.id in %s", (tuple(db_genome_ids), ))

            dict_genomes_user = {}
            for (prefix, public_id, owner_id, username, owned_by_root,
                 fasta_path) in self.cur:

                if not self.currentUser.isRootUser():
                    if (owned_by_root
                            or owner_id != self.currentUser.getUserId()):
                        print(
                            "WARNING: Insufficient permissions to edit genome {0}"
                            .format(public_id))
                        print logging.warn(
                            "{0} is trying to delete genome {1} owned by {2}".
                            format(self.currentUser.getUsername(), public_id,
                                   username))
                        return (False, None, None)
                dict_genomes_user[public_id] = {
                    "owner": username,
                    "prefix": prefix,
                    "relative_path": fasta_path
                }

            if self.currentUser.isRootUser():
                current_username = self.currentUser.getElevatedFromUsername()
            else:
                current_username = self.currentUser.getUsername()
        except GenomeDatabaseError as e:
            raise e

        return (True, current_username, dict_genomes_user)
Пример #12
0
    def addUser(self,
                username,
                firstname,
                lastname,
                rolename=None,
                has_root=False):
        """Create a new user account.

        Parameters
        ----------
        username : str
            Login name of the new user; must not exist yet.
        firstname : str
            First name of the new user.
        lastname : str
            Last name of the new user.
        rolename : str
            Name of the role in user_roles; defaults to 'user'.
        has_root : bool
            Whether the new user may log in as root.

        Returns
        -------
        bool
            True if the user was created.

        Raises
        ------
        GenomeDatabaseError
            If the current user is not allowed to create such an account, if
            the username already exists, or if the role is unknown.
        """
        if rolename is None:
            rolename = 'user'

        if not self.currentUser.isRootUser():
            if has_root:
                raise GenomeDatabaseError(
                    "Only the root user may grant root access to new users."
                )

            if rolename == 'admin':
                raise GenomeDatabaseError(
                    "Only the root user may create admin accounts.")

            if not (self.currentUser.getRolename() == 'admin'
                    and rolename == 'user'):
                raise GenomeDatabaseError(
                    "Only admins (and root) can create user accounts.")

        self.cur.execute("SELECT username from users where username = %s",
                         (username, ))

        if self.cur.fetchall():
            raise GenomeDatabaseError(
                "User %s already exists in the database." % username)

        self.cur.execute(
            "INSERT into users (username,firstname,lastname, role_id, has_root_login) ("
            + "SELECT %s,%s,%s, id, %s " + "FROM user_roles " +
            "WHERE name = %s)",
            (username, firstname, lastname, has_root, rolename))

        # The INSERT selects from user_roles, so an unknown role name inserts
        # nothing. Only an explicit zero is treated as failure because a
        # cursor may report -1 when the count is undetermined.
        if getattr(self.cur, 'rowcount', -1) == 0:
            raise GenomeDatabaseError(
                "Role %s does not exist in the database." % rolename)

        return True
Пример #13
0
    def _identifyHeadersCheckM(self, checkm_fh):
        """Parse header information from CheckM file.

        Parameters
        ----------
        checkm_fh : file
            Handle to CheckM file.
        """

        required_headers = {
            "Bin Id": None,
            "Completeness": None,
            "Contamination": None,
            "Marker lineage": None,
            "# genomes": None,
            "# markers": None,
            "# marker sets": None,
            "Strain heterogeneity": None
        }

        # Check the CheckM headers are consistent
        split_headers = checkm_fh.readline().rstrip().split("\t")
        for pos in range(0, len(split_headers)):
            header = split_headers[pos]
            if header not in required_headers:
                continue

            if required_headers[header] is not None:
                raise GenomeDatabaseError(
                    "Seen %s header twice in the CheckM file. Check that the CheckM file is correct: %s."
                    % (header, checkm_fh.name))

            required_headers[header] = pos

        for header, col in required_headers.items():
            if (header is "Completeness"
                    or header is "Contamination") and col is None:
                raise GenomeDatabaseError(
                    "Unable to find %s header in the CheckM file. Check that the CheckM file is correct: %s."
                    % (header, checkm_fh.name))

        return required_headers
Пример #14
0
    def permissionToModify(self, genome_list_id):
        """Tell whether the current user may modify a genome list.

        Parameters
        ----------
        genome_list_id : int
            Unique identifier of genome list in database.

        Returns
        -------
        bool
            True if the user has permission, else False.

        Raises
        ------
        GenomeDatabaseError
            If the list does not exist, or if the root user addresses a list
            that is not owned by root.
        """

        self.cur.execute(
            "SELECT owner_id, owned_by_root " + "FROM genome_lists " +
            "WHERE id = %s ", (genome_list_id, ))
        record = self.cur.fetchone()

        if not record:
            raise GenomeDatabaseError("No genome list with id: %s" %
                                      str(genome_list_id))

        owner_id, owned_by_root = record

        if self.currentUser.isRootUser():
            # Root is limited to root owned lists for the time being.
            if not owned_by_root:
                raise GenomeDatabaseError(
                    "Root user editing of other users lists not yet implemented."
                )
            return True

        # Regular users may only touch lists they own themselves.
        return not (owned_by_root
                    or owner_id != self.currentUser.getUserId())
Пример #15
0
    def rootLogin(self, username):
        """Log the given user in as root.

        Parameters
        ----------
        username : str
            Name of the user requesting root access.

        Returns
        -------
        User
            The root user object, also stored in self.currentUser.

        Raises
        ------
        GenomeDatabaseError
            If the user does not exist or has no root login permission.
        """
        self.cur.execute(
            "SELECT id, has_root_login FROM users WHERE username = %s",
            [username])
        record = self.cur.fetchone()
        # NOTE(review): the cursor is closed right after the lookup;
        # presumably a new one is opened elsewhere - confirm against callers.
        self.cur.close()

        if not record:
            raise GenomeDatabaseError("User %s not found." % username)

        _userid, has_root_login = record
        if not has_root_login:
            raise GenomeDatabaseError(
                "You do not have sufficient permissions to logon as the root user."
            )

        self.currentUser = User.createRootUser(username)
        return self.currentUser
Пример #16
0
    def _processCheckM(self, checkm_file):
        """Parse information from CheckM file.

        Parameters
        ----------
        checkm_file : str
            Name of file containing CheckM results.

        Returns
        -------
        dict
            CheckM statistics for each genome.
        """

        try:
            checkm_fh = open(checkm_file, "rb")
        except:
            raise GenomeDatabaseError("Cannot open CheckM file: " +
                                      checkm_file)

        required_headers = self._identifyHeadersCheckM(checkm_fh)

        # populate CheckM results dict
        checkm_results_dict = {}

        for line in checkm_fh:
            line = line.rstrip()
            splitline = line.split("\t")

            bin_id = splitline[required_headers["Bin Id"]]
            completeness = splitline[required_headers["Completeness"]]
            contamination = splitline[required_headers["Contamination"]]
            lineage = splitline[required_headers["Marker lineage"]]
            genome_count = splitline[required_headers["# genomes"]]
            marker_count = splitline[required_headers["# markers"]]
            set_count = splitline[required_headers["# marker sets"]]
            heterogeneity = splitline[required_headers["Strain heterogeneity"]]

            checkm_results_dict[bin_id] = {
                "completeness": completeness,
                "contamination": contamination,
                "lineage": lineage,
                "genome_count": genome_count,
                "marker_count": marker_count,
                "set_count": set_count,
                "heterogeneity": heterogeneity
            }

        checkm_fh.close()

        return checkm_results_dict
Пример #17
0
    def createMetadata(self, metadatafile):
        """Create or update metadata columns in the database.

        For every field listed in the file, the column is added to its table
        if the field is not yet known, or only its comment is updated if the
        field already exists in that same table. A field that exists in a
        different table is skipped with a warning. The database function
        refreshView() is called at the end.

        Parameters
        ----------
        metadatafile : str
            TSV file listing one new field per line, in the format
            new_field \\t description \\t type \\t table.

        Raises
        ------
        GenomeDatabaseError
            If a line has fewer than four fields or names a table that does
            not start with "metadata_".
        """
        data_dict = {}
        with open(metadatafile, 'r') as metaf:
            for line_num, line in enumerate(metaf, 1):
                array_line = line.strip().split('\t')
                if array_line == ['']:
                    # Blank lines carry no field definition.
                    continue

                if len(array_line) < 4:
                    raise GenomeDatabaseError(
                        "Malformed line %d in %s: expected field, description, type and table separated by tabs."
                        % (line_num, metadatafile))

                if not array_line[3].startswith("metadata_"):
                    raise GenomeDatabaseError(
                        "Only Metadata Tables can be modified")

                data_dict[array_line[0]] = {
                    "table": array_line[3],
                    "type": array_line[2],
                    "desc": array_line[1]
                }

        query = "SELECT v.field,v.table from view_list_meta_columns as v"
        self.cur.execute(query)
        all_col_dict = dict(self.cur.fetchall())

        for key, value in data_dict.items():
            # The description is bound as a parameter so that quotes in it
            # cannot break or alter the statement.
            # NOTE(review): table, column and type names are still spliced
            # in as text, since identifiers cannot be bound as parameters.
            query_comment = "COMMENT ON COLUMN {0}.{1} IS %s".format(
                value['table'], key)

            if key in all_col_dict:
                if all_col_dict.get(key) == value['table']:
                    self.cur.execute(query_comment, (value['desc'], ))
                else:
                    logging.warning(
                        "Column {0} is already presents in the {1} table ."
                        .format(key, all_col_dict.get(key)))
            else:
                query_add_col = "ALTER TABLE {0} ADD COLUMN {1} {2}".format(
                    value['table'], key, value['type'])
                self.cur.execute(query_add_col)
                self.cur.execute(query_comment, (value['desc'], ))

        # PSQL views are not refreshed automatically, so the existing view
        # is dropped and recreated with a new definition.
        self.cur.execute("SELECT refreshView()")
Пример #18
0
    def editUser(self,
                 username,
                 rolename=None,
                 has_root=None,
                 firstname=None,
                 lastname=None):
        """Update attributes of an existing user account.

        Only the attributes that are not None are changed; if none is given
        no statement is executed.

        Parameters
        ----------
        username : str
            Login name of the account to edit.
        rolename : str
            New role, given by its name in user_roles.
        has_root : bool
            New value of the root login flag.
        firstname : str
            New first name.
        lastname : str
            New last name.

        Returns
        -------
        bool
            True if no error occurred.

        Raises
        ------
        GenomeDatabaseError
            If the current user is not the root user.
        """
        if not self.currentUser.isRootUser():
            raise GenomeDatabaseError(
                "Only the root user may edit existing accounts.")

        # SET clause and bound value for every editable attribute, in the
        # order in which they appear in the statement.
        candidates = (
            (" role_id = (SELECT id from user_roles where name = %s) ",
             rolename),
            (" has_root_login = %s ", has_root),
            (" firstname = %s ", firstname),
            (" lastname = %s ", lastname),
        )

        conditional_queries = []
        params = []
        for clause, value in candidates:
            if value is not None:
                conditional_queries.append(clause)
                params.append(value)

        if params:
            self.cur.execute(
                "UPDATE users " + "SET " + ','.join(conditional_queries) +
                " "
                "WHERE username = %s", params + [username])

        return True
Пример #19
0
    def printGenomeDetails(self, genome_id_list):
        """Collect database details of genomes for printing.

        Parameters
        ----------
        genome_id_list : iterable
            Unique identifiers of genomes in database.

        Returns
        -------
        tuple
            Column headers.
        list
            One tuple per genome: external id, name, description, owner
            ("root" for root owned genomes, else the username) and the date
            the genome was added.

        Raises
        ------
        GenomeDatabaseError
            If genome_id_list is empty.
        """

        if not genome_id_list:
            raise GenomeDatabaseError(
                "Unable to print genomes. No genomes found.")

        columns = ("genomes.id, genomes.name, description, owned_by_root, username, "
                   "external_id_prefix || '_' || id_at_source as external_id, date_added")

        query = ("SELECT " + columns + " FROM genomes "
                 "LEFT OUTER JOIN users ON genomes.owner_id = users.id "
                 "JOIN genome_sources AS sources ON genome_source_id = sources.id "
                 "AND genomes.id in %s "
                 "ORDER BY genomes.id ASC")
        self.cur.execute(query, (tuple(genome_id_list), ))

        # NOTE(review): "data_added" looks like a typo for "date_added"; it
        # is kept unchanged because the header is part of the output.
        header = ("genome_id", "name", "description", "owner", "data_added")

        rows = [(external_id, name, description,
                 "root" if owned_by_root else username,
                 date_added.date())
                for (_genome_id, name, description, owned_by_root, username,
                     external_id, date_added) in self.cur]

        return header, rows
Пример #20
0
    def printGenomeListsDetails(self, genome_list_ids):
        """Collect genome list details for printing.

        Parameters
        ----------
        genome_list_ids : iterable
            Unique identifiers of genome lists in database.

        Returns
        -------
        tuple
            Column headers.
        list
            One tuple per list: list id, name, owner ("root" for root owned
            lists, else the username) and the number of genomes in the list.

        Raises
        ------
        GenomeDatabaseError
            If genome_list_ids is empty.
        """

        if not genome_list_ids:
            raise GenomeDatabaseError(
                "Unable to print genome details: No genomes given.")

        query = (
            "SELECT lists.id, lists.name, lists.owned_by_root, users.username, count(contents.list_id) "
            "FROM genome_lists as lists "
            "LEFT OUTER JOIN users ON lists.owner_id = users.id "
            "JOIN genome_list_contents as contents ON contents.list_id = lists.id "
            "WHERE lists.id in %s "
            "GROUP by lists.id, users.username "
            "ORDER by lists.display_order asc, lists.id")
        self.cur.execute(query, (tuple(genome_list_ids), ))

        header = ("list_id", "name", "owner", "genome_count")

        rows = []
        for record in self.cur:
            list_id, name, owned_by_root, username, genome_count = record
            owner = "root" if owned_by_root else username
            rows.append((list_id, name, owner, genome_count))

        return header, rows
Пример #21
0
    def printMarkerSetsDetails(self, marker_set_ids):
        """Collect marker set details for printing.

        Parameters
        ----------
        marker_set_ids : iterable
            Unique identifiers of marker sets in database.

        Returns
        -------
        tuple
            Column headers.
        list
            One tuple per set: set id, name, description, owner ("root" for
            root owned sets, else the username) and the number of markers in
            the set.

        Raises
        ------
        GenomeDatabaseError
            If marker_set_ids is empty.
        """
        if not marker_set_ids:
            raise GenomeDatabaseError(
                "Unable to print marker set details: No marker sets given."
            )

        query = (
            "SELECT sets.id, sets.name, sets.description, sets.owned_by_root, users.username, count(contents.set_id) "
            "FROM marker_sets as sets "
            "LEFT OUTER JOIN users ON sets.owner_id = users.id "
            "JOIN marker_set_contents as contents ON contents.set_id = sets.id "
            "WHERE sets.id in %s "
            "GROUP by sets.id, users.username "
            "ORDER by sets.id asc ")
        self.cur.execute(query, (tuple(marker_set_ids), ))

        header = ("set_id", "name", "description", "owner", "marker_count")

        rows = []
        for record in self.cur:
            (set_id, name, description, owned_by_root, username,
             marker_count) = record
            owner = "root" if owned_by_root else username
            rows.append((set_id, name, description, owner, marker_count))

        return header, rows
Пример #22
0
    def userLogin(self, username):
        """Log in as the given user.

        The user and its role are looked up by username; the resulting
        user object is stored as the current user of this instance.

        Parameters
        ----------
        username : str
            Name of the user to log in as.

        Returns
        -------
        object
            The user object produced by User.createUser.

        Raises
        ------
        GenomeDatabaseError
            If no user with the given username exists.
        """

        query = ("SELECT users.id, user_roles.id, user_roles.name "
                 "FROM users, user_roles "
                 "WHERE users.role_id = user_roles.id "
                 "AND users.username = %s")
        self.cur.execute(query, (username, ))

        record = self.cur.fetchone()
        if not record:
            raise GenomeDatabaseError("User not found: %s" % username)

        user_id, role_id, rolename = record
        self.currentUser = User.createUser(user_id, username, rolename,
                                           role_id)

        return self.currentUser
Пример #23
0
    def printMarkerDetails(self, marker_id_list):
        """Collect details about marker genes.

        Parameters
        ----------
        marker_id_list : iterable
            Unique identifier of markers in database.

        Returns
        -------
        tuple
            Column headers.
        list
            One tuple per marker: external identifier (database prefix and
            identifier joined by an underscore), name, description, and
            size.

        Raises
        ------
        GenomeDatabaseError
            If no marker identifiers are given.
        """

        if not marker_id_list:
            raise GenomeDatabaseError(
                "Unable to print markers. No markers found.")

        query = (
            "SELECT markers.id, markers.name, description, "
            "external_id_prefix || '_' || id_in_database as external_id, size"
            " FROM markers "
            "LEFT OUTER JOIN users ON markers.owner_id = users.id "
            "JOIN marker_databases AS databases ON marker_database_id = databases.id "
            "AND markers.id in %s "
            "ORDER BY markers.id ASC")
        self.cur.execute(query, (tuple(marker_id_list), ))

        # table header
        header = ("Marker ID", "Name", "Description", "Length (aa)")

        # the internal marker id is only used for ordering; the external
        # identifier is what gets reported
        rows = [(external_id, name, description, size)
                for (_marker_id, name, description, external_id,
                     size) in self.cur]

        return header, rows
Пример #24
0
    def editMarkerSet(self,
                      marker_set_id,
                      marker_ids=None,
                      operation=None,
                      name=None,
                      description=None,
                      private=None):
        """Edit an existing marker set in the database.

        Any of name, description, and private that is not None is written
        to the marker set. If an operation is given, the markers in
        marker_ids are additionally added to or removed from the set. A
        set left without markers by a removal is deleted, and aligned
        marker data is deleted for removed markers that no longer belong
        to any marker set.

        Parameters
        ----------
        marker_set_id : int
            Identifier of marker set in database.
        marker_ids : iterable
            Database identifiers of the markers to add or remove. Required
            when an operation is given.
        operation : str
            Operation to perform on the marker set ('add' or 'remove'), or
            None to only update the set's own fields.
        name : str
            New name of the marker set.
        description : str
            New description of the marker set.
        private : bool
            New value of the set's private flag.

        Returns
        -------
        bool
            True if successful.

        Raises
        ------
        GenomeDatabaseError
            If the marker set cannot be retrieved or modified by the
            current user, if an operation is requested without marker
            ids, or if the operation is unknown.
        """

        edit_permission = self.permissionToModify(marker_set_id)
        if edit_permission is None:
            raise GenomeDatabaseError(
                "Unable to retrieve marker set id for editing. Offending set id: %s"
                % marker_set_id)
        elif edit_permission is False:
            raise GenomeDatabaseError(
                "Insufficient permissions to edit this marker set. Offending set id: %s"
                % marker_set_id)

        # only fields explicitly provided by the caller are updated
        update_query = []
        params = []
        for column, value in (("name", name), ("description", description),
                              ("private", private)):
            if value is not None:
                update_query.append(column + " = %s")
                params.append(value)

        if params:
            self.cur.execute(
                "UPDATE marker_sets SET " + ",".join(update_query) +
                " WHERE id = %s", params + [marker_set_id])

        if operation is None:
            return True

        # accept None and arbitrary iterables, not just sized containers
        marker_ids = list(marker_ids) if marker_ids is not None else []
        if not marker_ids:
            raise GenomeDatabaseError(
                "No marker ids given to perform '%s' operation." % operation)

        # reject unknown operations before any temporary table is created
        if operation not in ('add', 'remove'):
            raise GenomeDatabaseError(
                "Unknown marker set edit operation: %s" % operation)

        # Stage the marker ids in a temporary table. Table names cannot be
        # bound as query parameters, hence the string formatting of the
        # internally generated name.
        temp_table_name = Tools.generateTempTableName()
        self.cur.execute("CREATE TEMP TABLE %s (id integer)" %
                         (temp_table_name, ))
        query = "INSERT INTO {0} (id) VALUES (%s)".format(temp_table_name)
        self.cur.executemany(query, [(x, ) for x in marker_ids])

        if operation == 'add':
            # insert only the markers not already part of the set
            query = ("INSERT INTO marker_set_contents (set_id, marker_id) "
                     "SELECT %s, id FROM {0} "
                     "WHERE id NOT IN ( "
                     "SELECT marker_id "
                     "FROM marker_set_contents "
                     "WHERE set_id = %s)").format(temp_table_name)
            self.cur.execute(query, (marker_set_id, marker_set_id))
            return True

        # operation == 'remove'
        query = ("DELETE FROM marker_set_contents "
                 "WHERE set_id = %s "
                 "AND marker_id IN ( "
                 "SELECT id "
                 "FROM {0})").format(temp_table_name)
        self.cur.execute(query, (marker_set_id, ))

        self.cur.execute(
            "SELECT count(msc.marker_id) from marker_sets as ms "
            "LEFT JOIN  marker_set_contents as msc on msc.set_id = ms.id "
            "WHERE ms.id = %s "
            "GROUP BY ms.id", (marker_set_id, ))
        count = self.cur.fetchone()
        if count is not None and count[0] == 0:
            # the set is deleted because it no longer contains any marker
            self.cur.execute("DELETE FROM marker_sets WHERE id = %s ",
                             (marker_set_id, ))

        # delete the aligned markers not associated with any marker set
        self.cur.execute("SELECT DISTINCT marker_id FROM marker_set_contents")
        markers_in_use = {
            marker_id for (marker_id, ) in self.cur.fetchall()
        }
        mkrs_to_del = [x for x in marker_ids if x not in markers_in_use]

        # an empty tuple would be rendered as "IN ()", which is invalid SQL
        if mkrs_to_del:
            self.cur.execute(
                "DELETE FROM aligned_markers where marker_id in %s",
                (tuple(mkrs_to_del), ))

        return True
Пример #25
0
    def filterGenomes(self, marker_ids, genome_ids, quality_threshold,
                      quality_weight, comp_threshold, cont_threshold,
                      min_perc_aa, min_rep_perc_aa, taxa_filter,
                      guaranteed_taxa_filter, genomes_to_exclude,
                      guaranteed_ids, rep_ids, directory, prefix):
        """Filter genomes based on provided criteria.

        Genomes are removed, in this order, based on taxonomic
        affiliation, estimated quality, explicit exclusion, and the
        percentage of the multiple sequence alignment (MSA) of the chosen
        markers they cover with amino acids. Every removed genome is
        reported in <directory>/<prefix>_filtered_genomes.tsv and the
        identifiers of retained genomes are written to
        <directory>/<prefix>_good_genomes.tsv.

        Parameters
        ----------
        marker_ids : iterable
            Database identifiers of the markers making up the alignment.
        genome_ids : iterable
            Database identifiers of the genomes to filter. The collection
            itself is not modified.
        quality_threshold : float
            Quality threshold passed to the genome quality filter.
        quality_weight : float
            Quality weight passed to the genome quality filter.
        comp_threshold : float
            Completeness threshold passed to the genome quality filter.
        cont_threshold : float
            Contamination threshold passed to the genome quality filter.
        min_perc_aa : float
            Minimum percentage of the alignment length a genome must
            cover with amino acids.
        min_rep_perc_aa : float
            Percentage below which a representative genome failing
            min_perc_aa is removed rather than retained with a warning.
        taxa_filter
            Taxonomic filter applied with guaranteed genomes retained;
            skipped if empty.
        guaranteed_taxa_filter
            Taxonomic filter applied without retaining guaranteed
            genomes; skipped if empty.
        genomes_to_exclude : iterable
            Database identifiers of genomes to remove unconditionally;
            skipped if empty.
        guaranteed_ids : iterable
            Database identifiers of genomes exempt from the quality
            filter, and from the amino acid filter unless they have no
            amino acids in the MSA at all.
        rep_ids : container
            Database identifiers of representative genomes.
        directory : str
            Output directory; created if it does not exist.
        prefix : str
            Prefix of the output file names.

        Returns
        -------
        set
            Database identifiers of retained genomes.
        list
            Identifiers of the chosen markers in alignment order.
        dict
            Details of each chosen marker indexed by marker identifier.

        Raises
        ------
        GenomeDatabaseError
            If a genome is marked for both retention and exclusion, or if
            the chosen markers have a combined length of zero.
        """

        if not os.path.exists(directory):
            os.makedirs(directory)

        # get mapping from db genome IDs to external IDs
        genome_mngr = GenomeManager(self.cur, self.currentUser)
        external_ids = genome_mngr.genomeIdsToExternalGenomeIds(genome_ids)

        self.logger.info('Filtering initial set of %d genomes.' %
                         len(genome_ids))

        # Work on private sets: the caller's collections are never
        # modified, and the set operations used below are valid whatever
        # container type was passed in.
        genomes_to_retain = set(genome_ids)
        guaranteed_ids = set(guaranteed_ids)

        extra_guaranteed_ids = guaranteed_ids - genomes_to_retain
        if extra_guaranteed_ids:
            self.logger.warning(
                'Identified {0} guaranteed genomes absent from specified input genomes (Those genomes will not appear in the final tree).'
                .format(len(extra_guaranteed_ids)))
            guaranteed_ids -= extra_guaranteed_ids
        self.logger.info(
            'Identified %d genomes to be excluded from filtering.' %
            len(guaranteed_ids))

        # for all markers, get the expected marker size
        marker_id_tuple = tuple(marker_ids)
        self.cur.execute(
            "SELECT markers.id, markers.name, description, id_in_database, size, external_id_prefix "
            "FROM markers, marker_databases "
            "WHERE markers.id in %s "
            "AND markers.marker_database_id = marker_databases.id "
            "ORDER by external_id_prefix ASC, id_in_database ASC",
            (marker_id_tuple, ))

        chosen_markers = dict()
        chosen_markers_order = []

        total_alignment_len = 0
        for (marker_id, marker_name, marker_description, id_in_database,
             size, external_id_prefix) in self.cur:
            chosen_markers[marker_id] = {
                'external_id_prefix': external_id_prefix,
                'name': marker_name,
                'description': marker_description,
                'id_in_database': id_in_database,
                'size': size
            }
            chosen_markers_order.append(marker_id)
            total_alignment_len += size

        filter_genome_file = os.path.join(directory,
                                          prefix + '_filtered_genomes.tsv')

        # the report is closed even if one of the filtering steps fails
        with open(filter_genome_file, 'w') as fout_filtered:
            # filter genomes based on taxonomy
            taxa_filters = (
                (taxa_filter, True, 'Filtered on taxonomic affiliation.'),
                (guaranteed_taxa_filter, False,
                 'Filtered on guaranteed taxonomic affiliation.'))
            for taxa, retain_guaranteed, reason in taxa_filters:
                if not taxa:
                    continue

                new_genomes_to_retain = self._taxa_filter(
                    taxa,
                    genomes_to_retain,
                    guaranteed_ids,
                    retain_guaranteed=retain_guaranteed)
                for genome_id in genomes_to_retain - new_genomes_to_retain:
                    rep_str = 'Representative' if genome_id in rep_ids else ''
                    fout_filtered.write(
                        '%s\t%s\t%s\n' %
                        (external_ids[genome_id], reason, rep_str))

                genomes_to_retain = new_genomes_to_retain

            # find genomes based on completeness, contamination, or genome
            # quality
            self.logger.info(
                'Filtering genomes with completeness <%.1f%%, contamination >%.1f%%, or quality <%.1f%% (weight = %.1f).'
                % (comp_threshold, cont_threshold, quality_threshold,
                   quality_weight))
            filtered_genomes = self._filterOnGenomeQuality(
                genomes_to_retain, quality_threshold, quality_weight,
                comp_threshold, cont_threshold)

            # sanity check representatives are not of poor quality
            final_filtered_genomes = set()
            for genome_id, quality in filtered_genomes.items():
                if genome_id in guaranteed_ids:
                    continue

                if genome_id in rep_ids:
                    self.logger.warning(
                        'Retaining representative genome %s despite poor estimated quality (comp=%.1f%%, cont=%.1f%%).'
                        % (external_ids[genome_id], quality[0], quality[1]))
                else:
                    final_filtered_genomes.add(genome_id)
                    fout_filtered.write(
                        '%s\t%s\t%.2f\t%.2f\n' %
                        (external_ids[genome_id],
                         'Filtered on quality (completeness, contamination).',
                         quality[0], quality[1]))

            self.logger.info(
                'Filtered %d genomes based on completeness, contamination, and quality.'
                % len(final_filtered_genomes))

            # build a new set so a set handed back by a helper is not
            # modified behind its back
            genomes_to_retain = genomes_to_retain - final_filtered_genomes

            # filter genomes explicitly specified for exclusion
            if genomes_to_exclude:
                for genome_id in genomes_to_exclude:
                    if genome_id in external_ids:
                        fout_filtered.write(
                            '%s\t%s\n' % (external_ids[genome_id],
                                          'Explicitly marked for exclusion.'))

                conflicting_genomes = guaranteed_ids.intersection(
                    genomes_to_exclude)
                if conflicting_genomes:
                    raise GenomeDatabaseError(
                        'Genomes marked for both retention and exclusion, e.g.: %s'
                        % conflicting_genomes.pop())

                new_genomes_to_retain = genomes_to_retain.difference(
                    genomes_to_exclude)
                self.logger.info(
                    'Filtered %d genomes explicitly indicated for exclusion.'
                    % (len(genomes_to_retain) - len(new_genomes_to_retain)))
                genomes_to_retain = new_genomes_to_retain

            # filter genomes with insufficient number of amino acids in MSA
            self.logger.info(
                'Filtering genomes with insufficient amino acids in the MSA.')
            aligned_marker_query = (
                "SELECT sequence, multiple_hits,hit_number,unique_genes "
                "FROM aligned_markers "
                "WHERE genome_id = %s "
                "AND sequence is NOT NULL "
                "AND marker_id IN %s")

            filter_on_aa = set()
            for genome_id in genomes_to_retain:
                self.cur.execute(aligned_marker_query,
                                 (genome_id, marker_id_tuple))

                # markers with multiple hits only count when all hits are
                # the same gene
                total_aa = 0
                for (sequence, multiple_hits, _hit_number,
                     unique_genes) in self.cur:
                    if not multiple_hits or unique_genes == 1:
                        total_aa += len(sequence) - sequence.count('-')

                # should retain guaranteed genomes unless they have zero
                # amino acids in MSA
                if genome_id in guaranteed_ids:
                    if total_aa != 0:
                        continue

                    self.logger.warning(
                        'Filtered guaranteed genome %s with zero amino acids in MSA.'
                        % external_ids[genome_id])

                if not total_alignment_len:
                    raise GenomeDatabaseError(
                        'Unable to assess amino acid coverage: selected markers have a combined length of zero.'
                    )

                perc_alignment = total_aa * 100.0 / total_alignment_len
                if perc_alignment >= min_perc_aa:
                    continue

                rep_str = ''
                if genome_id in rep_ids:
                    if perc_alignment >= min_rep_perc_aa:
                        self.logger.warning(
                            'Retaining representative genome %s despite small numbers of aligned amino acids (%.1f%%).'
                            % (external_ids[genome_id], perc_alignment))
                        continue

                    rep_str = 'Representative'
                    self.logger.warning(
                        'Filtered representative genome %s due to lack of aligned amino acids (%.1f%%).'
                        % (external_ids[genome_id], perc_alignment))

                filter_on_aa.add(genome_id)
                fout_filtered.write('%s\t%s\t%d\t%.1f\t%s\n' % (
                    external_ids[genome_id],
                    'Insufficient number of amino acids in MSA (total AA, % alignment length)',
                    total_aa, perc_alignment, rep_str))

        self.logger.info(
            'Filtered %d genomes with insufficient amino acids in the MSA.' %
            len(filter_on_aa))

        genomes_to_retain = genomes_to_retain - filter_on_aa
        self.logger.info('Producing tree data for %d genomes.' %
                         len(genomes_to_retain))

        good_genomes_file = os.path.join(directory,
                                         prefix + '_good_genomes.tsv')
        with open(good_genomes_file, 'w') as good_genomes:
            for item in genomes_to_retain:
                good_genomes.write("{0}\n".format(item))

        return (genomes_to_retain, chosen_markers_order, chosen_markers)
Пример #26
0
    def runTreeWeightedExceptions(self, path, comp, conta, qweight, qt):
        '''
        Function: runTreeWeightedExceptions
        Export a CSV file of genomes that fall outside the main selection
        but pass the DefaultValues.EXCEPTION_FILTER_ONE_* thresholds.

        The main selection consists of genomes with genome_source_id 2 or
        3 that either have a GTDB representative assigned or satisfy the
        completeness, contamination, and quality thresholds given as
        arguments. The fourth rank of their NCBI taxonomy strings (the
        order, judging by the 'o__' prefix tested below) is recorded.
        Remaining genomes passing the exception thresholds are then kept
        as follows: genomes without an assigned order are all kept;
        otherwise, when _checkTaxonomyUniqueness accepts the genome
        against the recorded orders, only the genome with the highest
        quality is kept for each order.

        :param path: Path to the output file
        :param comp: Minimum CheckM completeness of the main selection
        :param conta: Maximum CheckM contamination of the main selection
        :param qweight: Weight of contamination in the quality score
            (completeness - qweight * contamination) of the main selection
        :param qt: Minimum quality score of the main selection
        :return: True if successful
        :raises GenomeDatabaseError: if the current user is not the root user
        '''
        if not self.currentUser.isRootUser():
            raise GenomeDatabaseError(
                "Only the root user can run this command")

        selection_query = (
            "SELECT id,mt.ncbi_taxonomy FROM genomes g "
            "LEFT JOIN metadata_genes mg USING (id) "
            "LEFT JOIN metadata_taxonomy mt  USING (id) "
            "LEFT JOIN metadata_ncbi mn  USING (id) "
            "WHERE g.genome_source_id IN (2,3) and (mt.gtdb_genome_representative is not NULL or "
            "(mt.gtdb_genome_representative is NULL and mg.checkm_completeness > %s and mg.checkm_contamination < %s "
            "and mg.checkm_completeness-%s*mg.checkm_contamination > %s)) and mt.ncbi_taxonomy is not NULL"
        )
        selection_params = (comp, conta, qweight, qt)
        self.cur.execute(selection_query, selection_params)
        # report the exact statement sent to the database
        print(self.cur.mogrify(selection_query, selection_params))

        existing_id = []
        order_list = []
        for genome_id, ncbi_taxonomy in self.cur:
            existing_id.append(genome_id)
            order_list.append(ncbi_taxonomy.split(';')[3])

        exception_query = (
            "SELECT g.id,g.name,mg.checkm_completeness,mg.checkm_contamination,mt.ncbi_taxonomy,mnuc.genome_size,(mg.checkm_completeness-4*mg.checkm_contamination) as quality_threshold,mn.ncbi_organism_name "
            "FROM genomes g "
            "LEFT JOIN metadata_genes mg USING (id) "
            "LEFT JOIN metadata_ncbi mn  USING (id) "
            "LEFT JOIN metadata_nucleotide mnuc  USING (id) "
            "LEFT JOIN metadata_taxonomy mt  USING (id) "
            "WHERE g.genome_source_id IN (2,3) and "
            "(mg.checkm_completeness > %s and  mg.checkm_contamination < %s "
            "and mg.checkm_completeness-4*mg.checkm_contamination > %s) and mt.ncbi_taxonomy is not NULL"
        )
        exception_params = [
            DefaultValues.EXCEPTION_FILTER_ONE_CHECKM_COMPLETENESS,
            DefaultValues.EXCEPTION_FILTER_ONE_CHECKM_CONTAMINATION,
            DefaultValues.EXCEPTION_FILTER_ONE_QUALITY_THRESHOLD
        ]
        # "NOT IN ()" is invalid SQL, so the exclusion clause is only
        # added when the main selection is not empty
        if existing_id:
            exception_query += " and g.id not in %s"
            exception_params.append(tuple(existing_id))
        self.cur.execute(exception_query, exception_params)

        dict_except_order = {}
        for (gid, name, compl, contam, ncbitax, size, qual,
             orga) in self.cur:
            order = ncbitax.split(';')[3]
            quality = float(qual)

            if order == 'o__':
                # genomes without an order are all kept, keyed by genome id
                dict_except_order[gid] = {
                    'quality': quality,
                    'id': gid,
                    'full_info': [
                        name, compl, contam, ncbitax, size, qual, orga,
                        'unknown order'
                    ]
                }
                continue

            if not self._checkTaxonomyUniqueness(ncbitax, order_list):
                continue

            # keep the genome if nothing is recorded for it yet or if it
            # is of higher quality than the genome recorded for its order
            if (self._checkTaxonomyUniqueness(ncbitax, dict_except_order)
                    or dict_except_order.get(order).get('quality') <
                    quality):
                dict_except_order[order] = {
                    'quality': quality,
                    'id': gid,
                    'full_info': [
                        name, compl, contam, ncbitax, size, qual, orga,
                        'First'
                    ]
                }

        # the output file is closed even if writing fails
        with open(path, "w") as fh:
            fh.write(
                "Name,CheckM_Completeness,CheckM_Contamination,NCBI_Taxonomy,Genome_size,Quality_Threshold,Organism_name,Filter_passed\n"
            )
            for item in dict_except_order.values():
                fh.write(",".join(str(v)
                                  for v in item.get('full_info')) + "\n")

        return True
Пример #27
0
    def runSanityCheck(self):
        """Run consistency checks on the database and print the findings.

        After the rRNA, type strain, and MIMAG validations, the following
        problems are printed to standard output:
          - GTDB representatives whose identifier is no longer among the
            source identifiers of the genomes table,
          - genomes without a row in the queried metadata tables,
          - genomes with an empty protein count, NCBI submitter, CheckM
            completeness, or GC count.

        Returns
        -------
        bool
            True if the checks ran to completion.

        Raises
        ------
        GenomeDatabaseError
            If the current user is not the root user.
        """
        if not self.currentUser.isRootUser():
            raise GenomeDatabaseError(
                "Only the root user can run this command")

        def is_blank(value, zero_is_blank=True):
            # NULL, the empty string and, optionally, zero count as blank
            return (value is None or value == ''
                    or (zero_is_blank and value == 0))

        # validate rRNA gene data
        self.logger.info(
            'Validating 5S, 16S, and 23S count and gene length data.')
        self._validateSSU_LSU()

        # validate type strains
        self.logger.info('Validating type strain.')
        self._validateTypeStrains()

        # validate MIMAG assignments
        self.logger.info('Validating MIMAG assignments.')
        self._validateMIMAG()

        # check if the representatives are still in the database
        self.cur.execute(
            "SELECT id FROM genomes where genome_source_id in (2,3)")
        ncbi_ids = [gid for (gid, ) in self.cur]

        self.cur.execute("SELECT id,id_at_source FROM genomes")
        raw_ids = [(gid, source_id) for (gid, source_id) in self.cur]
        # built without zip(*...) so an empty genomes table is handled
        all_ids = [gid for (gid, _source_id) in raw_ids]
        all_source_ids = {source_id for (_gid, source_id) in raw_ids}
        dict_all_ids = dict(raw_ids)

        self.cur.execute(
            "SELECT distinct(gtdb_genome_representative) from metadata_taxonomy where gtdb_genome_representative is not NULL"
        )
        representatives = [
            self._chompRecord(record) for (record, ) in self.cur
        ]

        for representative in representatives:
            if representative not in all_source_ids:
                print("REPRESENTATIVE {0} has been removed from the database".
                      format(representative))

        # protein count and submitter of the genomes with source id 2 or 3
        self.cur.execute(
            "SELECT id,protein_count,ncbi_submitter from metadata_genes LEFT JOIN metadata_ncbi using (id) WHERE id in (SELECT id from genomes where genome_source_id in (2,3));"
        )
        dict_meta_ncbi = {
            gid: (count, submitter)
            for (gid, count, submitter) in self.cur
        }

        for ncbi_genome in ncbi_ids:
            if ncbi_genome not in dict_meta_ncbi:
                # NOTE(review): the query is driven by metadata_genes, so a
                # missing id means no metadata_genes row; message kept as is
                print("{0} has no metadata in metadata_ncbi".format(
                    dict_all_ids[ncbi_genome]))
                continue

            count, submitter = dict_meta_ncbi[ncbi_genome]
            if is_blank(count):
                # NOTE(review): protein_count is selected from the
                # metadata_genes join above although the message names
                # metadata_nucleotide; message kept as is
                print("{0} protein_count value in metadata_nucleotide is {1}".
                      format(dict_all_ids[ncbi_genome], count))
            if is_blank(submitter, zero_is_blank=False):
                print("{0} ncbi_submitter value in metadata_ncbi is {1}".
                      format(dict_all_ids[ncbi_genome], submitter))

        # CheckM completeness and protein count of all genomes
        self.cur.execute(
            "SELECT id,checkm_completeness,protein_count from metadata_genes"
        )
        dict_meta_genes = {
            gid: (checkm, count)
            for (gid, checkm, count) in self.cur
        }

        for genome in all_ids:
            if genome not in dict_meta_genes:
                print("{0} has no metadata in metadata_genes".format(
                    dict_all_ids[genome]))
                continue

            checkm, count = dict_meta_genes[genome]
            # a completeness of zero is a legitimate value, not a blank
            if is_blank(checkm, zero_is_blank=False):
                print("{0} checkm_completeness value in metadata_genes is {1}".
                      format(dict_all_ids[genome], checkm))
            if is_blank(count):
                print("{0} protein_count value in metadata_genes is {1}".
                      format(dict_all_ids[genome], count))

        # GC count of all genomes
        self.cur.execute("SELECT id,gc_count from metadata_nucleotide")
        dict_meta_nuc = {gid: gc for (gid, gc) in self.cur}

        for genome in all_ids:
            if genome not in dict_meta_nuc:
                print("{0} has no metadata in metadata_nucleotide".format(
                    dict_all_ids[genome]))
            elif is_blank(dict_meta_nuc[genome]):
                print("{0} gc_count value in metadata_nucleotide is {1}".
                      format(dict_all_ids[genome], dict_meta_nuc[genome]))

        return True
Пример #28
0
    def deleteGenomes(self, batchfile=None, db_genome_ids=None, reason=None):
        """Delete genomes from the database and deprecate their files.

        Rows referencing the genomes are removed from the marker, genome
        list and metadata tables before the genomes themselves are deleted.
        Afterwards the directory of every genome owned by the current user
        is moved to the matching deprecated directory; genomes owned by
        someone else are only reported in the log.

        Parameters
        ----------
        batchfile : str
            Text file listing a range of ids to delete. Not read by this
            method.
        db_genome_ids : list
            Database identifiers of the genomes to delete. False signals
            that the ids could not be retrieved.
        reason : str
            Justification for the deletion, written to the log.

        Returns
        -------
        bool
            True if successful.

        Raises
        ------
        GenomeDatabaseError
            If the ids or permissions could not be retrieved, permissions
            are insufficient, the user declines the confirmation, or a
            genome has an unrecognised prefix.
        """

        self._loggerSetup()

        if db_genome_ids is False:
            raise GenomeDatabaseError(
                "Unable to delete genomes. Unable to retrieve genome ids.")

        # restrict deletion to genomes owned by user
        has_permission, username, genomes_owners = self._hasPermissionToEditGenomes(
            db_genome_ids)

        if has_permission is None:
            raise GenomeDatabaseError(
                "Unable to delete genomes. Unable to retrieve permissions for genomes."
            )

        if has_permission is False:
            raise GenomeDatabaseError(
                "Unable to delete genomes. Insufficient permissions.")

        if not db_genome_ids:
            # nothing to delete
            return True

        if not confirm(
                "Are you sure you want to delete %i genomes (this action cannot be undone)"
                % len(db_genome_ids)):
            raise GenomeDatabaseError("User aborted database action.")

        # Dependent rows are removed first, the genomes table last.
        delete_queries = (
            "DELETE FROM aligned_markers WHERE genome_id IN %s",
            "DELETE FROM genome_list_contents WHERE genome_id IN %s",
            "DELETE FROM metadata_genes WHERE id IN %s",
            "DELETE FROM metadata_ncbi WHERE id IN %s",
            "DELETE FROM metadata_nucleotide WHERE id IN %s",
            "DELETE FROM metadata_taxonomy WHERE id IN %s",
            "DELETE FROM metadata_rna WHERE id IN %s",
            "DELETE FROM metadata_sequence WHERE id IN %s",
            "DELETE FROM genomes WHERE id IN %s",
        )
        genome_id_params = (tuple(db_genome_ids), )
        for query in delete_queries:
            self.cur.execute(query, genome_id_params)

        # An empty tuple is rendered as "IN ()", which is not valid SQL,
        # so the update is skipped when there is nothing to match.
        if genomes_owners:
            self.cur.execute(
                "UPDATE metadata_taxonomy set gtdb_genome_representative = NULL where  "
                + "gtdb_genome_representative in %s",
                (tuple(genomes_owners.keys()), ))

        for genome, info in genomes_owners.items():
            owner = info.get("owner")
            if str(username) != str(owner):
                # Files of genomes owned by someone else are left in place.
                logging.info(
                    '''Genome {0} has been deleted by {1} for the following reason '{2}'
                                          WARNING: {1} is not the owner of this {0} (real owner {3} )
                                          {0} needs to be moved manually to the deprecated folder'''
                    .format(genome, username, reason, owner))
                continue

            prefix = info.get("prefix")
            relative_path = info.get("relative_path")

            # Compare by value: identity ("is") only matches when both
            # strings happen to be the same object.
            if prefix == "U":
                target = os.path.dirname(
                    os.path.join(self.deprecatedUserDir, relative_path))
            elif prefix == "GB":
                target = os.path.join(self.deprecatedGBKDir, relative_path)
            elif prefix == "RS":
                target = os.path.join(self.deprecatedRSQDir, relative_path)
            else:
                # Without this guard the target of a previous genome would
                # be reused (or no target would exist at all).
                raise GenomeDatabaseError(
                    "Unable to deprecate files of genome {0}. Unknown genome prefix: {1}"
                    .format(genome, prefix))

            make_sure_path_exists(target)
            os.rename(
                os.path.dirname(
                    Tools.fastaPathGenerator(relative_path, prefix)), target)
            logging.info(
                "Genome {0} has been deleted by {1} for the following reason '{2}'"
                .format(genome, username, reason))

        return True
Пример #29
0
    def editGenomeList(self,
                       genome_list_id,
                       genome_ids=None,
                       operation=None,
                       name=None,
                       description=None,
                       private=None):
        """Edit an existing genome list in the database.

        The name, description and privacy flag are updated when given.
        When an operation is given, the genome ids are added to or removed
        from the list; a list left empty by a removal is deleted.

        Parameters
        ----------
        genome_list_id : int
            Identifier of genome list in database.
        genome_ids : list
            A list of genome ids to be modified.
        operation : str
            Operation to perform on genome list (add or remove).
        name : str
            New name of the list.
        description : str
            New description of the list.
        private : bool
            Denotes whether this list is public or private.

        Returns
        -------
        bool
            True if successful.

        Raises
        ------
        GenomeDatabaseError
            If the list cannot be retrieved or modified by the user, no
            genome ids accompany an operation, or the operation is unknown.
        """

        edit_permission = self.permissionToModify(genome_list_id)
        if edit_permission is None:
            raise GenomeDatabaseError(
                "Unable to retrieve genome list id for editing. Offending list id: %s"
                % genome_list_id)
        if edit_permission is False:
            raise GenomeDatabaseError(
                "Insufficient permissions to edit this genome list. Offending list id: %s"
                % genome_list_id)

        # Validate the requested operation before anything is written.
        if operation is not None:
            if not genome_ids:
                # covers both an empty list and the None default
                raise GenomeDatabaseError(
                    "No genome ids given to perform '%s' operation." %
                    operation)
            if operation not in ('add', 'remove'):
                raise GenomeDatabaseError(
                    "Unknown genome set edit operation: %s" % operation)

        update_query = []
        params = []

        if name is not None:
            update_query.append("name = %s")
            params.append(name)

        if description is not None:
            update_query.append("description = %s")
            params.append(description)

        if private is not None:
            update_query.append("private = %s")
            params.append(private)

        if params:
            self.cur.execute(
                "UPDATE genome_lists SET " + ",".join(update_query) +
                " WHERE id = %s", params + [genome_list_id])

        if operation is None:
            return True

        # Stage the genome ids in a temporary table. The table name is an
        # identifier and cannot be sent as a bound parameter.
        temp_table_name = generateTempTableName()
        self.cur.execute("CREATE TEMP TABLE %s (id integer)" %
                         (temp_table_name, ))
        insert_query = "INSERT INTO {0} (id) VALUES (%s)".format(
            temp_table_name)
        self.cur.executemany(insert_query, [(x, ) for x in genome_ids])

        if operation == 'add':
            # only insert genomes that are not already in the list
            query = (
                "INSERT INTO genome_list_contents (list_id, genome_id) "
                "SELECT %s, id FROM {0} "
                "WHERE id NOT IN ( "
                "SELECT genome_id "
                "FROM genome_list_contents "
                "WHERE list_id = %s)").format(temp_table_name)
            self.cur.execute(query, (genome_list_id, genome_list_id))
        else:
            query = ("DELETE FROM genome_list_contents "
                     "WHERE list_id = %s "
                     "AND genome_id IN ( "
                     "SELECT id "
                     "FROM {0})").format(temp_table_name)
            self.cur.execute(query, [genome_list_id])

            query_is_empty = (
                "SELECT count(glc.genome_id) from genome_lists as gl "
                "LEFT JOIN  genome_list_contents as glc on glc.list_id = gl.id "
                "WHERE gl.id = %s "
                "GROUP BY gl.id")
            self.cur.execute(query_is_empty, (genome_list_id, ))

            count = self.cur.fetchone()

            if count is not None and count[0] == 0:
                # We delete the list because it's empty
                self.cur.execute("DELETE FROM genome_lists WHERE id = %s",
                                 (genome_list_id, ))

        return True
Пример #30
0
    def _storeMetadata(self, db_genome_id, genome_dir):
        """Parse metadata files for genome and store in database.

        Nucleotide and gene metadata are read from tab-separated files in
        the genome directory. Greengenes SSU, SILVA SSU and SILVA LSU
        results are read through _parse_taxonomy_file and
        _parse_sequence_file. Values that parse as numbers are stored as
        floats, all others as read.

        Parameters
        ----------
        db_genome_id : str
            Unique database identifier of genome.
        genome_dir : str
            Directory containing metadata files to parse.

        Returns
        -------
        bool
            True if successful.

        Raises
        ------
        GenomeDatabaseError
            If the database reports an error.
        """
        try:
            # nucleotide metadata
            metadata_nt_path = os.path.join(genome_dir,
                                            ConfigMetadata.GTDB_NT_FILE)
            query_nt = "UPDATE metadata_nucleotide SET %s = %s WHERE id = {0}".format(
                db_genome_id)
            self._updateMetadataColumns(
                query_nt, self._readMetadataPairs(metadata_nt_path))

            # protein metadata
            metadata_gene_path = os.path.join(genome_dir,
                                              ConfigMetadata.GTDB_GENE_FILE)
            query_gene = "UPDATE metadata_genes SET %s = %s WHERE id = {0}".format(
                db_genome_id)
            self._updateMetadataColumns(
                query_gene, self._readMetadataPairs(metadata_gene_path))

            # The rRNA sections below all write to the same two tables.
            query_taxonomy = "UPDATE metadata_rna SET %s = %s WHERE id = {0}".format(
                db_genome_id)
            query_sequence = "UPDATE metadata_rrna_sequences SET %s = %s WHERE id = {0}".format(
                db_genome_id)

            # Greengenes SSU metadata
            metadata_ssu_gg_path = os.path.join(
                genome_dir, ConfigMetadata.GTDB_SSU_GG_OUTPUT_DIR,
                ConfigMetadata.GTDB_SSU_FILE)
            genome_list_taxonomy, _ssu_count, _ssu_query_id = self._parse_taxonomy_file(
                metadata_ssu_gg_path, ConfigMetadata.GTDB_SSU_GG_PREFIX)
            if genome_list_taxonomy:
                self._updateMetadataColumns(query_taxonomy,
                                            genome_list_taxonomy,
                                            "blast_subject_id")

            # SILVA SSU metadata
            metadata_ssu_silva_path = os.path.join(
                genome_dir, ConfigMetadata.GTDB_SSU_SILVA_OUTPUT_DIR,
                ConfigMetadata.GTDB_SSU_FILE)
            metadata_ssu_fna_silva_path = os.path.join(
                genome_dir, ConfigMetadata.GTDB_SSU_SILVA_OUTPUT_DIR,
                ConfigMetadata.GTDB_SSU_FNA_FILE)
            metadata_ssu_silva_summary_file = os.path.join(
                genome_dir, ConfigMetadata.GTDB_SSU_SILVA_OUTPUT_DIR,
                ConfigMetadata.GTDB_SSU_SILVA_SUMMARY_FILE)

            genome_list_taxonomy, ssu_count, ssu_query_id = self._parse_taxonomy_file(
                metadata_ssu_silva_path, ConfigMetadata.GTDB_SSU_SILVA_PREFIX,
                metadata_ssu_silva_summary_file)
            if genome_list_taxonomy:
                self._updateMetadataColumns(query_taxonomy,
                                            genome_list_taxonomy,
                                            "blast_subject_id")
                if ssu_query_id is not None:
                    genome_list_sequence = self._parse_sequence_file(
                        metadata_ssu_fna_silva_path,
                        ConfigMetadata.GTDB_SSU_SILVA_PREFIX, ssu_query_id)
                    for c, v in genome_list_sequence:
                        self.cur.execute(query_sequence, [AsIs(c), v])

            # SILVA LSU metadata
            metadata_lsu_silva_path = os.path.join(
                genome_dir, ConfigMetadata.GTDB_LSU_SILVA_OUTPUT_DIR,
                ConfigMetadata.GTDB_LSU_FILE)
            metadata_lsu_fna_silva_path = os.path.join(
                genome_dir, ConfigMetadata.GTDB_LSU_SILVA_OUTPUT_DIR,
                ConfigMetadata.GTDB_LSU_FNA_FILE)
            metadata_lsu_silva_summary_file = os.path.join(
                genome_dir, ConfigMetadata.GTDB_LSU_SILVA_OUTPUT_DIR,
                ConfigMetadata.GTDB_LSU_SILVA_SUMMARY_FILE)
            genome_list_taxonomy, lsu_count, lsu_query_id = self._parse_taxonomy_file(
                metadata_lsu_silva_path, ConfigMetadata.GTDB_LSU_SILVA_PREFIX,
                metadata_lsu_silva_summary_file)
            if genome_list_taxonomy:
                self._updateMetadataColumns(query_taxonomy,
                                            genome_list_taxonomy,
                                            "blast_subject_id")
                if lsu_query_id is not None:
                    genome_list_sequence = self._parse_sequence_file(
                        metadata_lsu_fna_silva_path,
                        ConfigMetadata.GTDB_LSU_SILVA_PREFIX, lsu_query_id)
                    for c, v in genome_list_sequence:
                        self.cur.execute(query_sequence, [AsIs(c), v])

            # rRNA gene counts reported by the SILVA searches
            query_gene_ssu = "UPDATE metadata_genes SET ssu_count = %s WHERE id = {0}".format(
                db_genome_id)
            self.cur.execute(query_gene_ssu, (ssu_count, ))

            query_gene_lsu = "UPDATE metadata_genes SET lsu_23s_count = %s WHERE id = {0}".format(
                db_genome_id)
            self.cur.execute(query_gene_lsu, (lsu_count, ))

            return True
        except psycopg2.Error as e:
            print("error")
            raise GenomeDatabaseError(e.pgerror)
        except:
            # Report the exception type, then let it propagate unchanged.
            print("Unexpected error: %s" % (sys.exc_info()[0], ))
            raise

    def _readMetadataPairs(self, metadata_path):
        """Return the tab-separated fields of each line of a metadata file as tuples."""
        with open(metadata_path) as metadata_file:
            return [
                tuple(line.rstrip().split('\t')) for line in metadata_file
            ]

    def _updateMetadataColumns(self, query, fields, text_marker=None):
        """Execute `query` once per (column, value) pair, converting numeric values to float.

        Columns whose name contains `text_marker` are stored as read.
        """
        # NOTE(review): column names are spliced into the statement with
        # AsIs, so they are trusted to be valid column names.
        for column, value in fields:
            try:
                if text_marker is None or text_marker not in column:
                    value = float(value)
            except (TypeError, ValueError):
                # not a number: store the value exactly as read
                pass
            self.cur.execute(query, [AsIs(column), value])