Example #1
    def _workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            genome_dir, filename = os.path.split(gene_file)
            output_hit_file = os.path.join(
                genome_dir,
                filename.replace(self.protein_file_suffix, self.pfam_suffix))

            cmd = 'pfam_search.pl -outfile %s -cpu %d -fasta %s -dir %s' % (
                output_hit_file, self.cpus_per_genome, gene_file,
                self.pfam_hmm_dir)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + self.checksum_suffix, 'w')
            fout.write(checksum)
            fout.close()

            # identify top hit for each gene
            self._topHit(output_hit_file)

            queueOut.put(gene_file)
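
Every worker on this page writes a `.sha256` sidecar file via a `sha256()` helper that the excerpts never define. A minimal sketch of what such a helper plausibly looks like (the signature is an assumption, not the original implementation): it hashes the file contents block by block and returns a hex digest.

import hashlib

def sha256(input_file, block_size=65536):
    # Assumed helper: hex SHA-256 digest of a file's contents,
    # read in blocks so large genome files do not exhaust memory.
    hasher = hashlib.sha256()
    with open(input_file, 'rb') as fin:
        for block in iter(lambda: fin.read(block_size), b''):
            hasher.update(block)
    return hasher.hexdigest()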
Example #2
  def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
      gene_file = queueIn.get(block=True, timeout=None)
      if gene_file is None:
        break

      assembly_dir, filename = os.path.split(gene_file)

      running_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_pfam.running'))
      if not os.path.exists(running_file):
        fout = open(running_file, 'w')
        fout.write('running')
        fout.close()

        output_hit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_pfam.tsv'))
        if not os.path.exists(output_hit_file):
          cmd = 'pfam_search.pl -outfile %s -cpu 1 -fasta %s -dir %s' % (output_hit_file, gene_file, self.pfam_hmm_dir)
          os.system(cmd)

          # calculate checksum
          checksum = sha256(output_hit_file)
          fout = open(output_hit_file + '.sha256', 'w')
          fout.write(checksum)
          fout.close()

        if os.path.exists(running_file):
          os.remove(running_file)

      queueOut.put(gene_file)
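
This example (like several workers later on this page) guards each genome with a `_pfam.running` marker file so concurrent or restarted runs skip work already in progress. One caveat: an exception between creating and removing the marker leaves it behind, permanently skipping that genome. A context-manager sketch of the same idea (hypothetical; `running_marker` is not part of the original code) removes the marker even on failure:

import os
from contextlib import contextmanager

@contextmanager
def running_marker(path):
    # Create the marker, then guarantee its removal when the block
    # exits, whether normally or via an exception.
    with open(path, 'w') as fout:
        fout.write('running')
    try:
        yield
    finally:
        if os.path.exists(path):
            os.remove(path)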
Example #3
    def _workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)
            output_hit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_suffix,
                                                                          self.tigrfam_suffix))

            hmmsearch_out = os.path.join(assembly_dir, filename.replace(self.protein_file_suffix, '_tigrfam.out'))
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu %d %s %s' % (hmmsearch_out,
                                                                                             output_hit_file,
                                                                                             self.cpus_per_genome,
                                                                                             self.tigrfam_hmms,
                                                                                             gene_file)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + self.checksum_suffix, 'w')
            fout.write(checksum)
            fout.close()

            # identify top hit for each gene
            self._topHit(output_hit_file)

            # allow results to be processed or written to file
            queueOut.put(gene_file)
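
The `hmmsearch` invocation above goes through `os.system`, which silently ignores failures. A sketch of the same call via `subprocess` (the function name `run_hmmsearch` is hypothetical), so a non-zero exit raises instead of producing an empty hit file with a valid checksum:

import subprocess

def run_hmmsearch(hmm_db, fasta_file, out_file, tblout_file, cpus=1):
    # Same flags as above; check_call raises CalledProcessError on failure.
    cmd = ['hmmsearch', '-o', out_file, '--tblout', tblout_file,
           '--noali', '--notextw', '--cut_nc', '--cpu', str(cpus),
           hmm_db, fasta_file]
    subprocess.check_call(cmd)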
Example #4
    def _tigr_top_hit(self, tigrfam_file, tigrfam_tophit_file):
        """Identify top TIGRfam hits."""

        tophits = {}
        for line in open(tigrfam_file):
            if line[0] == '#' or line[0] == '[':
                continue

            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[3]
            evalue = float(line_split[4])
            bitscore = float(line_split[5])
            if gene_id in tophits:
                if bitscore > tophits[gene_id][2]:
                    tophits[gene_id] = (hmm_id, evalue, bitscore)
            else:
                tophits[gene_id] = (hmm_id, evalue, bitscore)

        fout = open(tigrfam_tophit_file, 'w')
        fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
        for gene_id, stats in tophits.iteritems():
            hit_str = ','.join(map(str, stats))
            fout.write('%s\t%s\n' % (gene_id, hit_str))
        fout.close()

        # calculate checksum
        checksum = sha256(tigrfam_tophit_file)
        fout = open(tigrfam_tophit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()
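
The column indices in `_tigr_top_hit` follow the `hmmsearch --tblout` layout: field 0 is the target (gene) id, field 3 the query (TIGRFAM) accession, field 4 the full-sequence e-value, and field 5 the full-sequence bit score. A small check against a representative data row (the row itself is illustrative, not taken from real output):

# Illustrative --tblout data row; fields are whitespace-delimited.
line = ('gene_001 - PhoB TIGR02154 '
        '1.2e-50 170.3 0.1 1.4e-50 170.1 0.1 '
        '1.0 1 0 0 1 1 1 1 description')
fields = line.split()
assert fields[0] == 'gene_001'        # gene id
assert fields[3] == 'TIGR02154'       # TIGRFAM accession
assert float(fields[4]) == 1.2e-50    # e-value
assert float(fields[5]) == 170.3      # bit score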
Example #5
    def __tigr_worker(self, queue_in, queue_out):
        """Process each data item in parallel."""
        while True:
            gene_file = queue_in.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)

            output_hit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_tigrfam.tsv'))
            hmmsearch_out = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_tigrfam.out'))
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
            
            # determine top hits
            tigrfam_tophit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_tigrfam_tophit.tsv'))
            self._tigr_top_hit(output_hit_file, tigrfam_tophit_file)

            # allow results to be processed or written to file
            queue_out.put(gene_file)
Example #6
  def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
      gene_file = queueIn.get(block=True, timeout=None)
      if gene_file is None:
        break

      assembly_dir, filename = os.path.split(gene_file)

      running_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, 'ko.running'))
      if not os.path.exists(running_file):
        fout = open(running_file, 'w')
        fout.write('running')
        fout.close()

        output_hit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_ko.tsv'))
        if not os.path.exists(output_hit_file):
          cmd = 'diamond blastp -p %d -d %s -q %s -k 1 -e 1e-3 -f %s -o %s' % (
                          1,
                          self.uniprot_ko_db,
                          gene_file,
                          '6 qseqid qlen sseqid stitle slen length pident evalue bitscore',
                          output_hit_file)
          os.system(cmd)

          # calculate checksum
          checksum = sha256(output_hit_file)
          fout = open(output_hit_file + '.sha256', 'w')
          fout.write(checksum)
          fout.close()

        if os.path.exists(running_file):
          os.remove(running_file)

      queueOut.put(gene_file)
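
Note the DIAMOND output format string: `-f 6` with a custom field list, so the resulting `_ko.tsv` is tab-separated with exactly those nine columns. A sketch of a reader for that file (hypothetical helper; the column names come straight from the command above):

KO_COLUMNS = ['qseqid', 'qlen', 'sseqid', 'stitle', 'slen',
              'length', 'pident', 'evalue', 'bitscore']

def read_ko_hits(hit_file):
    # Yield one dict per DIAMOND hit; fields are tab-separated.
    with open(hit_file) as fin:
        for line in fin:
            yield dict(zip(KO_COLUMNS, line.rstrip('\n').split('\t')))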
Example #7
  def __workerThread(self, domain, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
      genome_file = queueIn.get(block=True, timeout=None)
      if genome_file is None:
        break

      assembly_dir, filename = os.path.split(genome_file)
      prefix = filename.replace(self.genome_file_ext, '')
      output_dir = os.path.join(assembly_dir, 'prokka')
      if os.path.exists(output_dir):
        queueOut.put(genome_file)
        continue
        
      os.makedirs(output_dir)
      prokka_out = os.path.join(output_dir, 'prokka.out')

      cmd = 'prokka --force --kingdom %s --prefix %s --outdir %s --cpus 1 %s 2> %s' % (domain, prefix, output_dir, genome_file, prokka_out)
      os.system(cmd)

      # calculate checksum
      prokka_gene_file = os.path.join(output_dir, prefix + '.faa')
      checksum = sha256(prokka_gene_file)
      fout = open(prokka_gene_file + '.sha256', 'w')
      fout.write(checksum)
      fout.close()
      
      # allow results to be processed or written to file
      queueOut.put(genome_file)
Example #8
    def _workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            genome_dir, filename = os.path.split(gene_file)
            output_hit_file = os.path.join(genome_dir, filename.replace(self.protein_file_suffix,
                                                                        self.pfam_suffix))

            cmd = 'pfam_search.pl -outfile %s -cpu %d -fasta %s -dir %s' % (output_hit_file,
                                                                            self.cpus_per_genome,
                                                                            gene_file,
                                                                            self.pfam_hmm_dir)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + self.checksum_suffix, 'w')
            fout.write(checksum)
            fout.close()

            # identify top hit for each gene
            self._topHit(output_hit_file)

            queueOut.put(gene_file)
Example #9
    def __workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)

            running_file = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext, '_pfam.running'))
            if not os.path.exists(running_file):
                fout = open(running_file, 'w')
                fout.write('running')
                fout.close()

                output_hit_file = os.path.join(
                    assembly_dir,
                    filename.replace(self.protein_file_ext, '_pfam.tsv'))
                if not os.path.exists(output_hit_file):
                    cmd = 'pfam_search.pl -outfile %s -cpu 1 -fasta %s -dir %s' % (
                        output_hit_file, gene_file, self.pfam_hmm_dir)
                    os.system(cmd)

                    # calculate checksum
                    checksum = sha256(output_hit_file)
                    fout = open(output_hit_file + '.sha256', 'w')
                    fout.write(checksum)
                    fout.close()

                if os.path.exists(running_file):
                    os.remove(running_file)

            queueOut.put(gene_file)
Example #10
    def _workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        try:
            while True:
                gene_file = queueIn.get(block=True, timeout=None)
                if gene_file is None:
                    break

                genome_dir, filename = os.path.split(gene_file)
                genome_id = filename.replace(self.protein_file_suffix, '')
                output_hit_file = os.path.join(
                    self.output_dir, genome_id,
                    filename.replace(self.protein_file_suffix,
                                     self.pfam_suffix))
                dir_path = os.path.dirname(os.path.realpath(__file__))
                pfam_search_script = os.path.join(dir_path, 'pfam_search.pl')
                cmd = '%s -outfile %s -cpu %d -fasta %s -dir %s' % (
                    pfam_search_script, output_hit_file, self.cpus_per_genome,
                    gene_file, self.pfam_hmm_dir)
                exit_status = os.system(cmd)
                if exit_status != 0:
                    raise RuntimeError(
                        "pfam_search.pl failed with status %d" % exit_status)

                # calculate checksum
                checksum = sha256(output_hit_file)
                fout = open(output_hit_file + self.checksum_suffix, 'w')
                fout.write(checksum)
                fout.close()

                # identify top hit for each gene
                self._topHit(output_hit_file)

                queueOut.put(gene_file)
        except Exception as error:
            raise error
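
One detail worth knowing when checking the result of `os.system` as done above: on POSIX it returns the raw wait status, not the program's exit code, so recovering the actual code requires `os.WEXITSTATUS`. A sketch:

import os

status = os.system('hmmsearch -h > /dev/null 2>&1')
if status != 0:
    # On POSIX the wait status packs the exit code into the high byte.
    print('command failed with exit code %d' % os.WEXITSTATUS(status))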
Example #11
  def __workerThread(self, queueIn, queueOut):
    """Process each data item in parallel."""
    while True:
      gene_file = queueIn.get(block=True, timeout=None)
      if gene_file is None:
        break

      assembly_dir, filename = os.path.split(gene_file)
      
      running_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_tigrfam.running'))
      if not os.path.exists(running_file):
        fout = open(running_file, 'w')
        fout.write('running')
        fout.close()
        
        output_hit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_tigrfam.tsv'))
        hmmsearch_out = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_tigrfam.out'))
        cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
        os.system(cmd)

        # calculate checksum
        checksum = sha256(output_hit_file)
        fout = open(output_hit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()

        if os.path.exists(running_file):
            os.remove(running_file)

      # allow results to be processed or written to file
      queueOut.put(gene_file)
Example #12
    def __workerThread(self, domain, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            genome_file = queueIn.get(block=True, timeout=None)
            if genome_file is None:
                break

            assembly_dir, filename = os.path.split(genome_file)
            prefix = filename.replace(self.genome_file_ext, "")
            output_dir = os.path.join(assembly_dir, "prokka")
            if os.path.exists(output_dir):
                queueOut.put(genome_file)
                continue

            os.makedirs(output_dir)
            prokka_out = os.path.join(output_dir, "prokka.out")

            cmd = "prokka --force --kingdom %s --prefix %s --outdir %s --cpus 1 %s 2> %s" % (
                domain,
                prefix,
                output_dir,
                genome_file,
                prokka_out,
            )
            os.system(cmd)

            # calculate checksum
            prokka_gene_file = os.path.join(output_dir, prefix + ".faa")
            checksum = sha256(prokka_gene_file)
            fout = open(prokka_gene_file + ".sha256", "w")
            fout.write(checksum)
            fout.close()

            # allow results to be processed or written to file
            queueOut.put(genome_file)
Example #13
    def __pfam_worker(self, queue_in, queue_out):
        """Process each data item in parallel."""
        while True:
            gene_file = queue_in.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)

            output_hit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_pfam.tsv'))
            cmd = 'pfam_search.pl -outfile %s -cpu 1 -fasta %s -dir %s' % (output_hit_file, gene_file, self.pfam_hmm_dir)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
            
            # determine top hits
            pfam_tophit_file = os.path.join(assembly_dir, filename.replace(self.protein_file_ext, '_pfam_tophit.tsv'))
            self._pfam_top_hit(output_hit_file, pfam_tophit_file)

            # allow results to be processed or written to file
            queue_out.put(gene_file)
Example #14
    def _workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)
            output_hit_file = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_suffix,
                                 self.tigrfam_suffix))

            hmmsearch_out = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_suffix, '_tigrfam.out'))
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu %d %s %s' % (
                hmmsearch_out, output_hit_file, self.cpus_per_genome,
                self.tigrfam_hmms, gene_file)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + self.checksum_suffix, 'w')
            fout.write(checksum)
            fout.close()

            # identify top hit for each gene
            self._topHit(output_hit_file)

            # allow results to be processed or written to file
            queueOut.put(gene_file)
Example #15
    def __pfam_worker(self, queue_in, queue_out):
        """Process each data item in parallel."""

        pfam_version = 'pfam_33.1'
        pfam_extension = f'_{pfam_version}.tsv'
        pfam_tophit_extension = f'_{pfam_version}_tophit.tsv'

        symlink_pfam_extension = '_pfam.tsv'
        symlink_pfam_tophit_extension = '_pfam_tophit.tsv'

        while True:
            gene_file = queue_in.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)
            make_sure_path_exists(os.path.join(assembly_dir, pfam_version))

            output_hit_file = os.path.join(
                assembly_dir, pfam_version,
                filename.replace(self.protein_file_ext, pfam_extension))
            cmd = 'pfam_search.pl -outfile %s -cpu 1 -fasta %s -dir %s' % (
                output_hit_file, gene_file, self.pfam_hmm_dir)
            os.system(cmd)
            # print(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()

            # determine top hits
            pfam_tophit_file = os.path.join(
                assembly_dir, pfam_version,
                filename.replace(self.protein_file_ext, pfam_tophit_extension))
            self._pfam_top_hit(output_hit_file, pfam_tophit_file)

            # create symlink in prodigal_folder
            new_hit_link = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext,
                                 symlink_pfam_extension))
            new_tophit_link = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext,
                                 symlink_pfam_tophit_extension))

            #==================================================================
            # print(f'{new_hit_link} will point to {output_hit_file}')
            # print(f'{new_tophit_link} will point to {pfam_tophit_file}')
            #==================================================================

            os.symlink(output_hit_file, new_hit_link)
            os.symlink(pfam_tophit_file, new_tophit_link)

            # allow results to be processed or written to file
            queue_out.put(gene_file)
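
`os.symlink` raises `FileExistsError` if the link already exists, so rerunning this worker over a partially processed tree would crash at the symlink step. A small idempotent variant (hypothetical helper, not in the original code):

import os

def force_symlink(target, link_name):
    # Remove any existing link (even a dangling one) before re-creating it.
    if os.path.lexists(link_name):
        os.remove(link_name)
    os.symlink(target, link_name)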
Example #16
    def moveGenomes(self, db_genome_ids):
        """Move genome files into database directory structure.

        This function assumes addGenomes() has been called. It is
        not directly called by addGenomes() as all database
        queries are performed before moving genomes.

        Parameters
        ----------
        db_genome_ids : list
            Unique database identifiers for genomes.
        """

        assert(self.tmp_output_dir)

        # get database genome identifiers
        self.cur.execute("SELECT genomes.id,user_editable, external_id_prefix || '_' || id_at_source as external_id " +
                         "FROM genomes, genome_sources " +
                         "WHERE genome_source_id = genome_sources.id " +
                         "AND genomes.id in %s", (tuple(db_genome_ids),))

        external_id_dict = {}
        for (genome_id, user_editable, external_id) in self.cur:
            if user_editable:
                external_id_dict[genome_id] = external_id

        if not external_id_dict:
            # nothing to move; still remove the temporary directory
            shutil.rmtree(self.tmp_output_dir)
            return

        if self.currentUser.isRootUser():
            username = self.currentUser.getElevatedFromUsername()
        else:
            username = self.currentUser.getUsername()

        if username is None:
            raise GenomeDatabaseError(
                "Unable to determine user to add genomes under.")

        gtdb_target_dir = os.path.join(self.genomeCopyDir, username)
        for db_genome_id, external_id in external_id_dict.items():
            tmp_genome_dir = os.path.join(self.tmp_output_dir, external_id)

            genome_target_dir = os.path.join(gtdb_target_dir, external_id)
            if os.path.exists(genome_target_dir):
                raise GenomeDatabaseError(
                    "Genome directory already exists: %s" % genome_target_dir)

            shutil.move(tmp_genome_dir, genome_target_dir)

            self.cur.execute("UPDATE genomes SET fasta_file_location = %s , genes_file_location = %s , genes_file_sha256 = %s WHERE id = %s", (
                os.path.join(
                    username, external_id, external_id + self.genomeFileSuffix),
                os.path.join(
                    username, external_id, self.userAnnotationDir, external_id + self.proteinFileSuffix),
                sha256(os.path.join(genome_target_dir, self.userAnnotationDir, external_id + self.proteinFileSuffix)),
                db_genome_id))

        shutil.rmtree(self.tmp_output_dir)
Example #17
    def run(self, genome_dir, threads):
        # get path to all unprocessed TIGRfam HMM result files
        print 'Reading TIGRfam HMM files.'
        tigrfam_files = []
        for genome_id in os.listdir(genome_dir):
            cur_genome_dir = os.path.join(genome_dir, genome_id)
            if os.path.isdir(cur_genome_dir):
                for assembly_id in os.listdir(cur_genome_dir):
                    assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                    groups = assembly_id.split('_')
                    processed_assembly_id = '_'.join(groups[:2])
                    tigrfam_tophit_file = os.path.join(assembly_dir, 'prodigal', processed_assembly_id + '_tigrfam_tophit.tsv')
                    if os.path.exists(tigrfam_tophit_file):
                        # verify checksum
                        checksum_file = tigrfam_tophit_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(tigrfam_tophit_file)
                            cur_checksum = open(checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                continue

                    tigrfam_file = os.path.join(assembly_dir, 'prodigal', processed_assembly_id + self.tigrfam_ext)
                    if os.path.exists(tigrfam_file):
                        tigrfam_files.append(tigrfam_file)

        print '  Number of unprocessed genomes: %d' % len(tigrfam_files)

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in tigrfam_files:
            workerQueue.put(f)

        for _ in range(threads):
            workerQueue.put(None)

        try:
            workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue)) for _ in range(threads)]
            writeProc = mp.Process(target=self.__writerThread, args=(len(tigrfam_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            writerQueue.put(None)
            writeProc.join()
        except:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
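
`run()` hands the output queue to a `__writerThread` consumer that none of these excerpts show. A plausible sketch, assuming it only tracks progress (the exact reporting format is a guess):

    def __writerThread(self, num_files, writer_queue):
        """Sketch of the progress consumer assumed by run()."""
        import sys

        # Drain the output queue until the None sentinel arrives from run().
        processed = 0
        while True:
            item = writer_queue.get(block=True, timeout=None)
            if item is None:
                break
            processed += 1
            status = '  Finished processing %d of %d (%.2f%%) files.' % (
                processed, num_files, processed * 100.0 / num_files)
            sys.stdout.write('%s\r' % status)
            sys.stdout.flush()
        sys.stdout.write('\n')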
Example #18
    def run(self, genome_dir, threads):
        # get path to all unprocessed Pfam HMM result files
        print 'Reading Pfam HMM files.'
        pfam_files = []
        for genome_id in os.listdir(genome_dir):
            cur_genome_dir = os.path.join(genome_dir, genome_id)
            if os.path.isdir(cur_genome_dir):
                for assembly_id in os.listdir(cur_genome_dir):
                    assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                    groups = assembly_id.split('_')
                    processed_assembly_id = '_'.join(groups[:2])
                    pfam_tophit_file = os.path.join(assembly_dir, 'prodigal', processed_assembly_id + '_pfam_tophit.tsv')
                    if os.path.exists(pfam_tophit_file):
                        # verify checksum
                        checksum_file = pfam_tophit_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(pfam_tophit_file)
                            cur_checksum = open(checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                continue

                    pfam_file = os.path.join(assembly_dir, 'prodigal', processed_assembly_id + self.pfam_ext)
                    if os.path.exists(pfam_file):
                        pfam_files.append(pfam_file)

        print '  Number of unprocessed genomes: %d' % len(pfam_files)

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in pfam_files:
            workerQueue.put(f)

        for _ in range(threads):
            workerQueue.put(None)

        try:
            workerProc = [mp.Process(target=self.__workerThread, args=(workerQueue, writerQueue)) for _ in range(threads)]
            writeProc = mp.Process(target=self.__writerThread, args=(len(pfam_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            writerQueue.put(None)
            writeProc.join()
        except:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
Example #19
    def _topHit(self, pfam_file):
        """Determine top hits to PFAMs.

        A gene may be assigned to multiple
        PFAM families from the same clan. The
        pfam_search.pl script takes care of
        most of these issues and here the results
        are simply parsed.

        Parameters
        ----------
        pfam_file : str
            Name of file containing hits to Pfam HMMs.
        """

        assembly_dir, filename = os.path.split(pfam_file)
        genome_id = filename.replace(self.pfam_suffix, '')
        output_tophit_file = os.path.join(
            self.output_dir, genome_id,
            filename.replace(self.pfam_suffix, self.pfam_top_hit_suffix))

        tophits = defaultdict(dict)
        for line in open(pfam_file):
            if line[0] == '#' or not line.strip():
                continue

            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[5]
            evalue = float(line_split[12])
            bitscore = float(line_split[11])
            if gene_id in tophits:
                if hmm_id in tophits[gene_id]:
                    if bitscore > tophits[gene_id][hmm_id][1]:
                        tophits[gene_id][hmm_id] = (evalue, bitscore)
                else:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)
            else:
                tophits[gene_id][hmm_id] = (evalue, bitscore)

        fout = open(output_tophit_file, 'w')
        fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
        for gene_id, hits in tophits.iteritems():
            hit_str = []
            for hmm_id, stats in hits.iteritems():
                hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
            fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
        fout.close()

        # calculate checksum
        checksum = sha256(output_tophit_file)
        fout = open(output_tophit_file + self.checksum_suffix, 'w')
        fout.write(checksum)
        fout.close()
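
Here the indices differ from the TIGRFAM parser because `pfam_search.pl` writes a PfamScan-style table: field 0 is the sequence id, field 5 the Pfam accession, field 11 the bit score, and field 12 the e-value. An illustrative row under that assumed layout:

# Illustrative PfamScan-style data row (whitespace-delimited).
line = ('gene_007 5 120 3 125 PF00072.27 Response_reg Domain '
        '1 112 112 95.3 1.2e-27 1 CL0304')
fields = line.split()
assert fields[5].startswith('PF')      # Pfam accession
assert float(fields[11]) == 95.3       # bit score
assert float(fields[12]) == 1.2e-27    # e-value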
Example #20
    def _topHit(self, pfam_file):
        """Determine top hits to PFAMs.

        A gene may be assigned to multiple
        PFAM families from the same clan. The
        pfam_search.pl script takes care of
        most of these issues and here the results
        are simply parsed.

        Parameters
        ----------
        pfam_file : str
            Name of file containing hits to Pfam HMMs.
        """

        assembly_dir, filename = os.path.split(pfam_file)
        output_tophit_file = os.path.join(assembly_dir, filename.replace(self.pfam_suffix,
                                                                         self.pfam_top_hit_suffix))

        tophits = defaultdict(dict)
        for line in open(pfam_file):
            if line[0] == '#' or not line.strip():
                continue

            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[5]
            evalue = float(line_split[12])
            bitscore = float(line_split[11])
            if gene_id in tophits:
                if hmm_id in tophits[gene_id]:
                    if bitscore > tophits[gene_id][hmm_id][1]:
                        tophits[gene_id][hmm_id] = (evalue, bitscore)
                else:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)
            else:
                tophits[gene_id][hmm_id] = (evalue, bitscore)

        fout = open(output_tophit_file, 'w')
        fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
        for gene_id, hits in tophits.iteritems():
            hit_str = []
            for hmm_id, stats in hits.iteritems():
                hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
            fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
        fout.close()

        # calculate checksum
        checksum = sha256(output_tophit_file)
        fout = open(output_tophit_file + self.checksum_suffix, 'w')
        fout.write(checksum)
        fout.close()
Example #21
    def _runProdigal(self, fasta_path):
        """Run Prodigal.

        Parameters
        ----------
        fasta_path : str
            Path to FASTA file to process.
        """

        temp_dir, fasta_file = os.path.split(fasta_path)
        output_dir = os.path.join(temp_dir, self.userAnnotationDir)
        genome_id = fasta_file[0:fasta_file.rfind('_')]

        prodigal = BioLibProdigal(1, False)
        summary_stats = prodigal.run([fasta_path], output_dir)
        summary_stats = summary_stats[summary_stats.keys()[0]]

        # rename output files to adhere to GTDB conventions
        aa_gene_file = os.path.join(
            output_dir, genome_id + ConfigMetadata.PROTEIN_FILE_SUFFIX)
        shutil.move(summary_stats.aa_gene_file, aa_gene_file)

        nt_gene_file = os.path.join(
            output_dir, genome_id + ConfigMetadata.NT_GENE_FILE_SUFFIX)
        shutil.move(summary_stats.nt_gene_file, nt_gene_file)

        gff_file = os.path.join(output_dir,
                                genome_id + ConfigMetadata.GFF_FILE_SUFFIX)
        shutil.move(summary_stats.gff_file, gff_file)

        # save translation table information
        translation_table_file = os.path.join(
            output_dir, 'prodigal_translation_table.tsv')
        fout = open(translation_table_file, 'w')
        fout.write(
            '%s\t%d\n' %
            ('best_translation_table', summary_stats.best_translation_table))
        fout.write('%s\t%.2f\n' %
                   ('coding_density_4', summary_stats.coding_density_4 * 100))
        fout.write(
            '%s\t%.2f\n' %
            ('coding_density_11', summary_stats.coding_density_11 * 100))
        fout.close()

        checksum = sha256(aa_gene_file)
        fout = open(aa_gene_file + ConfigMetadata.CHECKSUM_SUFFIX, 'w')
        fout.write(checksum)
        fout.close()

        return (aa_gene_file, nt_gene_file, gff_file, translation_table_file)
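
The translation-table summary written above is a two-column TSV. A sketch of reading it back (hypothetical reader; the keys are exactly the ones `_runProdigal` writes):

def read_translation_table(tsv_path):
    # Returns e.g. {'best_translation_table': 11.0, 'coding_density_11': 91.2}
    stats = {}
    with open(tsv_path) as fin:
        for line in fin:
            key, value = line.rstrip('\n').split('\t')
            stats[key] = float(value)
    return stats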
Example #22
    def __workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            pfam_file = queueIn.get(block=True, timeout=None)
            if pfam_file is None:
                break

            assembly_dir, filename = os.path.split(pfam_file)
            output_tophit_file = os.path.join(
                assembly_dir,
                filename.replace(self.pfam_ext, '_pfam_tophit.tsv'))

            tophits = defaultdict(dict)
            for line in open(pfam_file):
                if line[0] == '#' or not line.strip():
                    continue

                line_split = line.split()
                gene_id = line_split[0]
                hmm_id = line_split[5]
                evalue = float(line_split[12])
                bitscore = float(line_split[11])
                if gene_id in tophits:
                    if hmm_id in tophits[gene_id]:
                        if bitscore > tophits[gene_id][hmm_id][1]:
                            tophits[gene_id][hmm_id] = (evalue, bitscore)
                    else:
                        tophits[gene_id][hmm_id] = (evalue, bitscore)
                else:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)

            fout = open(output_tophit_file, 'w')
            fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
            for gene_id, hits in tophits.iteritems():
                hit_str = []
                for hmm_id, stats in hits.iteritems():
                    hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
                fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
            fout.close()

            # calculate checksum
            checksum = sha256(output_tophit_file)
            fout = open(output_tophit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()

            # allow results to be processed or written to file
            queueOut.put(pfam_file)
Example #23
    def _topHit(self, tigrfam_file):
        """Determine top hits to TIGRFAMs.

        A gene is assigned to a single TIGRFAM
        family. This will be the top hit among
        all TIGRFAM HMMs and pass the threshold
        for the HMM.

        Parameters
        ----------
        tigrfam_file : str
            Name of file containing hits to TIGRFAM HMMs.
        """
        assembly_dir, filename = os.path.split(tigrfam_file)
        genome_id = filename.replace(self.tigrfam_suffix, '')
        output_tophit_file = os.path.join(
            self.output_dir, genome_id,
            filename.replace(self.tigrfam_suffix, self.tigrfam_top_hit_suffix))

        tophits = {}
        for line in open(tigrfam_file):
            if line[0] == '#':
                continue

            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[3]
            evalue = float(line_split[4])
            bitscore = float(line_split[5])
            if gene_id in tophits:
                if bitscore > tophits[gene_id][2]:
                    tophits[gene_id] = (hmm_id, evalue, bitscore)
            else:
                tophits[gene_id] = (hmm_id, evalue, bitscore)

        fout = open(output_tophit_file, 'w')
        fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
        for gene_id, stats in tophits.iteritems():
            hit_str = ','.join(map(str, stats))
            fout.write('%s\t%s\n' % (gene_id, hit_str))
        fout.close()

        # calculate checksum
        checksum = sha256(output_tophit_file)
        fout = open(output_tophit_file + self.checksum_suffix, 'w')
        fout.write(checksum)
        fout.close()
Example #24
    def __workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            pfam_file = queueIn.get(block=True, timeout=None)
            if pfam_file is None:
                break

            assembly_dir, filename = os.path.split(pfam_file)
            output_tophit_file = os.path.join(assembly_dir, filename.replace(self.pfam_ext, '_pfam_tophit.tsv'))

            tophits = defaultdict(dict)
            for line in open(pfam_file):
                if line[0] == '#' or not line.strip():
                    continue

                line_split = line.split()
                gene_id = line_split[0]
                hmm_id = line_split[5]
                evalue = float(line_split[12])
                bitscore = float(line_split[11])
                if gene_id in tophits:
                    if hmm_id in tophits[gene_id]:
                        if bitscore > tophits[gene_id][hmm_id][1]:
                            tophits[gene_id][hmm_id] = (evalue, bitscore)
                    else:
                        tophits[gene_id][hmm_id] = (evalue, bitscore)
                else:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)

            fout = open(output_tophit_file, 'w')
            fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
            for gene_id, hits in tophits.iteritems():
                hit_str = []
                for hmm_id, stats in hits.iteritems():
                    hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
                fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
            fout.close()

            # calculate checksum
            checksum = sha256(output_tophit_file)
            fout = open(output_tophit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()

            # allow results to be processed or written to file
            queueOut.put(pfam_file)
Example #25
    def _topHit(self, tigrfam_file):
        """Determine top hits to TIGRFAMs.

        A gene is assigned to a single TIGRFAM
        family. This will be the top hit among
        all TIGRFAM HMMs and pass the threshold
        for the HMM.

        Parameters
        ----------
        tigrfam_file : str
            Name of file containing hits to TIGRFAM HMMs.
        """
        assembly_dir, filename = os.path.split(tigrfam_file)
        output_tophit_file = os.path.join(assembly_dir, filename.replace(self.tigrfam_suffix,
                                                                         self.tigrfam_top_hit_suffix))

        tophits = {}
        for line in open(tigrfam_file):
            if line[0] == '#':
                continue

            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[3]
            evalue = float(line_split[4])
            bitscore = float(line_split[5])
            if gene_id in tophits:
                if bitscore > tophits[gene_id][2]:
                    tophits[gene_id] = (hmm_id, evalue, bitscore)
            else:
                tophits[gene_id] = (hmm_id, evalue, bitscore)

        fout = open(output_tophit_file, 'w')
        fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
        for gene_id, stats in tophits.iteritems():
            hit_str = ','.join(map(str, stats))
            fout.write('%s\t%s\n' % (gene_id, hit_str))
        fout.close()

        # calculate checksum
        checksum = sha256(output_tophit_file)
        fout = open(output_tophit_file + self.checksum_suffix, 'w')
        fout.write(checksum)
        fout.close()
Example #26
    def __workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            genome_file = queueIn.get(block=True, timeout=None)
            if genome_file is None:
                break

            assembly_dir, filename = os.path.split(genome_file)
            trna_dir = os.path.join(assembly_dir, 'trna')
            genome_id = filename[0:filename.find('_', 4)]

            if not os.path.exists(trna_dir):
                os.makedirs(trna_dir)

            output_file = os.path.join(trna_dir, genome_id + '_trna.tsv')
            log_file = os.path.join(trna_dir, genome_id + '_trna.log')
            stats_file = os.path.join(trna_dir, genome_id + '_trna_stats.tsv')

            domain_flag = '-B'
            if self.domain_dict.get(genome_id) == 'Archaea':
                domain_flag = '-A'

            #cmd = 'tRNAscan-SE %s -q -Q -o %s -m %s -l %s %s' % (domain_flag, output_file, stats_file, log_file, genome_file)
            # os.system(cmd)

            cmd_to_run = [
                'tRNAscan-SE', domain_flag, '-q', '-Q', '-o', output_file,
                '-m', stats_file, '-l', log_file, genome_file
            ]
            proc = subprocess.Popen(cmd_to_run,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            stdout, stderr = proc.communicate()
            # print proc.returncode
            if proc.returncode != 0:
                raise RuntimeError(
                    "%r failed, status code %s stdout %r stderr %r" %
                    (cmd_to_run, proc.returncode, stdout, stderr))
            checksum_file = open(output_file + '.sha256', 'w')
            checksum_file.write('{}\n'.format(sha256(output_file)))
            checksum_file.close()

            queueOut.put(genome_file)
Example #27
    def __workerThread(self, queueIn, queueOut):
        """Process each data item in parallel."""
        while True:
            gene_file = queueIn.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)

            running_file = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext, '_tigrfam.running'))
            if not os.path.exists(running_file):
                fout = open(running_file, 'w')
                fout.write('running')
                fout.close()

                output_hit_file = os.path.join(
                    assembly_dir,
                    filename.replace(self.protein_file_ext, '_tigrfam.tsv'))
                hmmsearch_out = os.path.join(
                    assembly_dir,
                    filename.replace(self.protein_file_ext, '_tigrfam.out'))
                cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (
                    hmmsearch_out, output_hit_file, self.tigrfam_hmms,
                    gene_file)
                os.system(cmd)

                # calculate checksum
                checksum = sha256(output_hit_file)
                fout = open(output_hit_file + '.sha256', 'w')
                fout.write(checksum)
                fout.close()

                if os.path.exists(running_file):
                    os.remove(running_file)

            # allow results to be processed or written to file
            queueOut.put(gene_file)
Example #28
    def _pfam_top_hit(self, pfam_file, pfam_tophit_file):
        """Identify top Pfam hits."""
        
        tophits = defaultdict(dict)
        for line in open(pfam_file):
            if line[0] == '#' or not line.strip():
                continue

            line_split = line.split()
            gene_id = line_split[0]
            hmm_id = line_split[5]
            evalue = float(line_split[12])
            bitscore = float(line_split[11])
            if gene_id in tophits:
                if hmm_id in tophits[gene_id]:
                    if bitscore > tophits[gene_id][hmm_id][1]:
                        tophits[gene_id][hmm_id] = (evalue, bitscore)
                else:
                    tophits[gene_id][hmm_id] = (evalue, bitscore)
            else:
                tophits[gene_id][hmm_id] = (evalue, bitscore)

        fout = open(pfam_tophit_file, 'w')
        fout.write('Gene Id\tTop hits (Family id,e-value,bitscore)\n')
        for gene_id, hits in tophits.iteritems():
            hit_str = []
            for hmm_id, stats in hits.iteritems():
                hit_str.append(hmm_id + ',' + ','.join(map(str, stats)))
            fout.write('%s\t%s\n' % (gene_id, ';'.join(hit_str)))
        fout.close()

        # calculate checksum
        checksum = sha256(pfam_tophit_file)
        fout = open(pfam_tophit_file + '.sha256', 'w')
        fout.write(checksum)
        fout.close()
Example #29
  def run(self, genome_dir, domain, genome_list, threads):
    genomes_to_process = None
    if genome_list:
      genomes_to_process = set()
      for line in open(genome_list):
        line_split = line.strip().split('\t')
        genome_id = line_split[0]
        if genome_id.startswith('GB_') or genome_id.startswith('RS_'):
          genome_id = genome_id[3:]

        genomes_to_process.add(genome_id)
  
    # get path to all unprocessed genome gene files
    print 'Reading genomes.'
    genome_files = []
    for genome_id in os.listdir(genome_dir):
      cur_genome_dir = os.path.join(genome_dir, genome_id)
      if os.path.isdir(cur_genome_dir):
        for assembly_id in os.listdir(cur_genome_dir):
          assembly_dir = os.path.join(cur_genome_dir, assembly_id)
          
          genome_id = assembly_id[0:assembly_id.find('_', 4)]
          if genomes_to_process and genome_id not in genomes_to_process:
            continue

          prokka_dir = os.path.join(assembly_dir, 'prokka')
          if os.path.exists(prokka_dir):
            continue
            
          prokka_file = os.path.join(prokka_dir, assembly_id + '.faa')
          if os.path.exists(prokka_file):
            # verify checksum
            checksum_file = prokka_file + '.sha256'
            if os.path.exists(checksum_file):
              checksum = sha256(prokka_file)
              cur_checksum = open(checksum_file).readline().strip()
              if checksum == cur_checksum:
                continue

          genome_file = os.path.join(assembly_dir, assembly_id + self.genome_file_ext)
          if os.path.exists(genome_file):
            genome_files.append(genome_file)

    print '  Number of unprocessed genomes: %d\n' % len(genome_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in genome_files:
      workerQueue.put(f)

    for _ in range(threads):
      workerQueue.put(None)

    try:
      workerProc = [mp.Process(target = self.__workerThread, args = (domain, workerQueue, writerQueue)) for _ in range(threads)]
      writeProc = mp.Process(target = self.__writerThread, args = (len(genome_files), writerQueue))

      writeProc.start()

      for p in workerProc:
        p.start()

      for p in workerProc:
        p.join()

      writerQueue.put(None)
      writeProc.join()
    except:
      for p in workerProc:
        p.terminate()

      writeProc.terminate()
Example #30
    def run(self, input_dir, threads):
        # get path to all unprocessed Pfam HMM result files
        print 'Reading Pfam HMM files.'
        pfam_files = []
        for first_three in os.listdir(input_dir):
            onethird_species_dir = os.path.join(input_dir, first_three)
            print onethird_species_dir
            if os.path.isfile(onethird_species_dir):
                continue
            for second_three in os.listdir(onethird_species_dir):
                twothird_species_dir = os.path.join(onethird_species_dir,
                                                    second_three)
                # print twothird_species_dir
                if os.path.isfile(twothird_species_dir):
                    continue
                for third_three in os.listdir(twothird_species_dir):
                    threethird_species_dir = os.path.join(
                        twothird_species_dir, third_three)
                    # print threethird_species_dir
                    if os.path.isfile(threethird_species_dir):
                        continue
                    for complete_name in os.listdir(threethird_species_dir):
                        assembly_dir = os.path.join(threethird_species_dir,
                                                    complete_name)
                        if os.path.isfile(assembly_dir):
                            continue
                        groups = complete_name.split('_')
                        processed_assembly_id = '_'.join(groups[:2])
                        pfam_tophit_file = os.path.join(
                            assembly_dir, 'prodigal',
                            processed_assembly_id + '_pfam_tophit.tsv')
                        if os.path.exists(pfam_tophit_file):
                            # verify checksum
                            checksum_file = pfam_tophit_file + '.sha256'
                            if os.path.exists(checksum_file):
                                checksum = sha256(pfam_tophit_file)
                                cur_checksum = open(
                                    checksum_file).readline().strip()
                                if checksum == cur_checksum:
                                    continue

                        pfam_file = os.path.join(
                            assembly_dir, 'prodigal',
                            processed_assembly_id + self.pfam_ext)
                        if os.path.exists(pfam_file):
                            pfam_files.append(pfam_file)

        print '  Number of unprocessed genomes: %d' % len(pfam_files)
        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in pfam_files:
            workerQueue.put(f)

        for _ in range(threads):
            workerQueue.put(None)

        try:
            workerProc = [
                mp.Process(target=self.__workerThread,
                           args=(workerQueue, writerQueue))
                for _ in range(threads)
            ]
            writeProc = mp.Process(target=self.__writerThread,
                                   args=(len(pfam_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            writerQueue.put(None)
            writeProc.join()
        except:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
Example #31
    def _addGenomeToDB(self, fasta_file_path, name, desc, source, id_at_source,
                       gene_path):
        """Add genome to database.

        Parameters
        ----------
        fasta_file_path : str
            Path to genome FASTA file with nucleotide sequences.
        name : str
            Desired name of genome.
        desc : str
            Description of genome.
        source : str
            Source of genome.
        id_at_source : int
            ?
        gene_path : str
            Path to called genes in amino acid space.

        Returns
        -------
        str
            Database identifier of genome.
        """
        try:
            fasta_sha256_checksum = sha256(fasta_file_path)

            gene_sha256_checksum = None
            if gene_path is not None:
                gene_sha256_checksum = sha256(gene_path)
            if source is None:
                source = self.defaultGenomeSourceName

            self.cur.execute(
                "SELECT id, external_id_prefix, user_editable FROM genome_sources WHERE name = %s",
                (source, ))
            source_id = None

            for (db_id, _external_id_prefix, user_editable) in self.cur:
                if (not user_editable):
                    if id_at_source is None:
                        raise GenomeDatabaseError(
                            "Cannot auto generate ids at source for the %s genome source."
                            % source)
                    if (not self.currentUser.isRootUser()):
                        raise GenomeDatabaseError(
                            "Only the root user can add genomes to the %s genome source."
                            % source)
                source_id = db_id
                break

            if source_id is None:
                raise GenomeDatabaseError(
                    "Could not find the %s genome source." % source)

            if id_at_source is None:
                # We use UPDATE to return a value. This fixes the concurrency issue of multiple threads obtaining the same value, as UPDATE locks the cell during the transaction.
                self.cur.execute("SELECT update_last_auto(%s);", (source_id, ))
                id_at_source = str(self.cur.fetchone()[0])

            added = datetime.datetime.now()

            owner_id = None
            if not self.currentUser.isRootUser():
                owner_id = self.currentUser.getUserId()

            self.cur.execute(
                "SELECT id FROM genomes WHERE genome_source_id = %s AND id_at_source = %s",
                (source_id, id_at_source))

            result = self.cur.fetchall()

            columns = "(name, description, owned_by_root, owner_id, fasta_file_location, " + \
                "fasta_file_sha256, genes_file_location, genes_file_sha256,genome_source_id, id_at_source, date_added)"

            if len(result):
                raise GenomeDatabaseError(
                    "Genome source '%s' already contains id '%s'. Use -f to force an overwrite."
                    % (source, id_at_source))

            self.cur.execute(
                "INSERT INTO genomes " + columns + " "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) " +
                "RETURNING id",
                (name, desc, self.currentUser.isRootUser(), owner_id,
                 fasta_file_path, fasta_sha256_checksum, gene_path,
                 gene_sha256_checksum, source_id, id_at_source, added))
            (db_genome_id, ) = self.cur.fetchone()

            return db_genome_id

        except GenomeDatabaseError as e:
            raise e
Example #32
    def run(self, input_dir, tmp_dir, threads):
        # get path to all unprocessed genome files
        print 'Reading genomes.'
        genome_files = []
        for genome_dir in os.listdir(input_dir):
            cur_genome_dir = os.path.join(input_dir, genome_dir)
            if not os.path.isdir(cur_genome_dir):
                continue

            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                # check if prodigal has already been called
                aa_gene_file = os.path.join(assembly_dir, 'prodigal',
                                            genome_id + '_protein.faa')
                if os.path.exists(aa_gene_file):
                    # verify checksum
                    checksum_file = aa_gene_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(aa_gene_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            continue

                genome_file = os.path.join(assembly_dir,
                                           assembly_id + '_genomic.fna')
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print '[Warning] Genome file appears to be empty: %s' % genome_file
                    else:
                        genome_files.append(genome_file)

        print '  Number of unprocessed genomes: %d' % len(genome_files)

        # run prodigal on each genome
        print 'Running prodigal.'
        prodigal = Prodigal(cpus=threads)
        summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

        # move results into individual genome directories
        print 'Moving files and calculating checksums.'
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)

            aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path,
                                            genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path,
                                            genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path,
                                        genome_root + '_protein.gff')

            os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
            os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
            os.system('mv %s %s' % (gff_file, new_gff_file))

            # save translation table information
            translation_table_file = os.path.join(
                prodigal_path, 'prodigal_translation_table.tsv')
            fout = open(translation_table_file, 'w')
            fout.write('%s\t%d\n' %
                       ('best_translation_table',
                        summary_stats[genome_id].best_translation_table))
            fout.write('%s\t%.2f\n' %
                       ('coding_density_4',
                        summary_stats[genome_id].coding_density_4 * 100))
            fout.write('%s\t%.2f\n' %
                       ('coding_density_11',
                        summary_stats[genome_id].coding_density_11 * 100))
            fout.close()

            checksum = sha256(new_aa_gene_file)
            fout = open(new_aa_gene_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
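
All of these examples call a sha256(path) helper that is imported from a utility module rather than defined in the snippets. A minimal stand-in with the assumed behaviour, the hex digest of a file's contents read in chunks, would be:

import hashlib

def sha256(path, block_size=65536):
    """Return the SHA-256 hex digest of the file at 'path'."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            h.update(block)
    return h.hexdigest()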
Exemplo n.º 33
0
    def __tigrfam_worker(self, queue_in, queue_out):
        """Process each data item in parallel."""
        tigrfam_version = 'tigrfam_15.0'
        tigrfam_extension = f'_{tigrfam_version}.tsv'
        tigrfam_tophit_extension = f'_{tigrfam_version}_tophit.tsv'

        symlink_tigrfam_extension = '_tigrfam.tsv'
        symlink_tigrfam_tophit_extension = '_tigrfam_tophit.tsv'

        while True:
            gene_file = queue_in.get(block=True, timeout=None)
            if gene_file is None:
                break

            assembly_dir, filename = os.path.split(gene_file)
            make_sure_path_exists(os.path.join(assembly_dir, tigrfam_version))

            output_hit_file = os.path.join(
                assembly_dir, tigrfam_version,
                filename.replace(self.protein_file_ext, tigrfam_extension))
            hmmsearch_out = os.path.join(
                assembly_dir, tigrfam_version,
                filename.replace(self.protein_file_ext,
                                 f'_{tigrfam_version}.out'))
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (
                hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()

            # determine top hits
            tigrfam_tophit_file = os.path.join(
                assembly_dir, tigrfam_version,
                filename.replace(self.protein_file_ext,
                                 tigrfam_tophit_extension))
            self._tigr_top_hit(output_hit_file, tigrfam_tophit_file)

            # create symlink in prodigal_folder
            new_hit_link = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext,
                                 symlink_tigrfam_extension))
            new_tophit_link = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext,
                                 symlink_tigrfam_tophit_extension))

            os.symlink(output_hit_file, new_hit_link)
            os.symlink(tigrfam_tophit_file, new_tophit_link)

            # allow results to be processed or written to file
            queue_out.put(gene_file)
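
One caveat in the symlink step above: os.symlink() raises FileExistsError if the link already exists, so rerunning the worker over a partially processed assembly directory will crash. A small wrapper (illustrative, not part of the original code) makes the step idempotent:

import os

def force_symlink(target, link_name):
    """Create link_name -> target, replacing any existing link or file."""
    # islink() is checked first because exists() is False for broken links
    if os.path.islink(link_name) or os.path.exists(link_name):
        os.remove(link_name)
    os.symlink(target, link_name)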
Exemplo n.º 34
0
    def run_hmmsearch(self, gtdb_genome_path_file, report, db):
        extension = ""
        name = ""
        worker = None
        if db == 'pfam':
            marker_folder = 'pfam_33.1'
            full_extension = '_pfam_33.1.tsv'
            symlink_extension = '_pfam.tsv'
            name = 'Pfam'
            worker = self.__pfam_worker
        elif db == 'tigrfam':
            marker_folder = 'tigrfam_15.0'
            full_extension = '_tigrfam_15.0.tsv'
            symlink_extension = '_tigrfam.tsv'
            #extension = '_tigrfam_15.0.tsv'
            name = 'Tigrfam'
            worker = self.__tigrfam_worker
        genomes_to_consider = set()
        for line in open(report):
            line_split = line.strip().split('\t')
            genome_id = line_split[1]

            attributes = line_split[2].split(';')
            for attribute in attributes:
                if attribute == 'new' or attribute == 'modified':
                    genomes_to_consider.add(genome_id)

        self.logger.info(
            f'Identified {len(genomes_to_consider)} genomes as new or modified.'
        )

        # get path to all unprocessed genome gene files
        self.logger.info('Checking genomes.')
        genome_files = []
        num_lines = 0
        for line in open(gtdb_genome_path_file):
            num_lines += 1
            sys.stdout.write(f'{num_lines} lines read.\r')
            sys.stdout.flush()

            line_split = line.strip().split('\t')

            gid = line_split[0]
            gpath = line_split[1]

            prodigal_dir = os.path.join(gpath, 'prodigal')
            marker_file = os.path.join(prodigal_dir, marker_folder,
                                       gid + full_extension)
            if os.path.exists(marker_file):
                #print("File exists: {}".format(marker_file))
                # verify checksum
                checksum_file = marker_file + '.sha256'
                if os.path.exists(checksum_file):
                    checksum = sha256(marker_file)
                    cur_checksum = open(checksum_file).readline().strip()
                    if checksum == cur_checksum:
                        if gid in genomes_to_consider:
                            self.logger.warning(
                                f'Genome {gid} is marked as new or modified, but already has {name} annotations.'
                            )
                            self.logger.warning('Genome is being skipped!')
                        continue

                self.logger.warning(
                    f'Genome {gid} has {name} annotations, but an invalid checksum and was not marked for reannotation.'
                )
                self.logger.warning('Genome will be reannotated.')

            elif gid not in genomes_to_consider:
                self.logger.warning(
                    f'Genome {gid} has no {name} annotations, but is also not marked for processing?'
                )
                self.logger.warning('Genome will be reannotated!')

            gene_file = os.path.join(prodigal_dir, gid + self.protein_file_ext)
            if os.path.exists(gene_file):
                if os.stat(gene_file).st_size == 0:
                    self.logger.warning(
                        f'Protein file appears to be empty: {gene_file}')
                else:
                    genome_files.append(gene_file)

        self.logger.info(f'Number of unprocessed genomes: {len(genome_files)}')

        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in genome_files:
            workerQueue.put(f)

        for _ in range(self.cpus):
            workerQueue.put(None)

        try:
            workerProc = [
                mp.Process(target=worker, args=(workerQueue, writerQueue))
                for _ in range(self.cpus)
            ]
            writeProc = mp.Process(target=self.__progress,
                                   args=(len(genome_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            writerQueue.put(None)
            writeProc.join()
        except:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
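
The __progress writer handed to mp.Process() above is not part of the excerpt. Judging from the sentinel handling in the sibling examples, its assumed contract is to consume results until it sees None and report a running count; a hypothetical sketch:

    # hypothetical sketch; 'sys' is assumed to be imported at module level
    def __progress(self, num_items, writer_queue):
        """Consume processed items until the None sentinel arrives."""
        num_processed = 0
        while True:
            item = writer_queue.get(block=True, timeout=None)
            if item is None:
                break
            num_processed += 1
            sys.stdout.write(f'Processed {num_processed} of {num_items} genomes.\r')
            sys.stdout.flush()
        sys.stdout.write('\n')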
Exemplo n.º 35
0
    def run(self, gtdb_genome_path_file):

        genomes_to_consider = None

        # get path to all genome files
        self.logger.info('Reading genomes.')
        genome_files = []
        num_lines = 0
        for line in open(gtdb_genome_path_file):
            num_lines += 1
            sys.stdout.write(f'{num_lines} lines read.\r')
            sys.stdout.flush()

            line_split = line.strip().split('\t')

            gid = line_split[0]
            gpath = line_split[1]
            assembly_id = os.path.basename(os.path.normpath(gpath))

            trna_dir = os.path.join(gpath, 'trna')

            trna_file = os.path.join(trna_dir, gid + '_trna.tsv')
            if os.path.exists(trna_file):
                # verify checksum
                checksum_file = trna_file + '.sha256'
                if os.path.exists(checksum_file):
                    checksum = sha256(trna_file)
                    cur_checksum = open(checksum_file).readline().strip()
                    if checksum == cur_checksum:
                        if genomes_to_consider and gid in genomes_to_consider:
                            self.logger.warning(
                                f'Genome {gid} is marked as new or modified, but already has tRNAs called.'
                            )
                            self.logger.warning('Genome is being skipped!')
                        continue

                self.logger.warning(
                    f'Genome {gid} has tRNAs called, but an invalid checksum and was not marked for reannotation.'
                )
                self.logger.warning('Genome will be reannotated.')

            elif genomes_to_consider and (gid not in genomes_to_consider):
                self.logger.warning(
                    f'Genome {gid} has no tRNAs called, but is also not marked for processing?'
                )
                self.logger.warning('Genome will be reannotated!')

            genome_file = os.path.join(gpath,
                                       assembly_id + self.genome_file_ext)
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    self.logger.warning(
                        f'Genome file appears to be empty: {genome_file}')
                else:
                    genome_files.append(genome_file)
        self.logger.info(
            f'Number of unprocessed genomes: {len(genome_files)}')

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in genome_files:
            workerQueue.put(f)

        for _ in range(self.cpus):
            workerQueue.put(None)

        try:
            workerProc = [
                mp.Process(target=self.__workerThread,
                           args=(workerQueue, writerQueue))
                for _ in range(self.cpus)
            ]
            writeProc = mp.Process(target=self.__writerThread,
                                   args=(len(genome_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            writerQueue.put(None)
            writeProc.join()
        except:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
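
The verify-checksum-then-skip block recurs in nearly every example in this collection. A small helper (not in the original codebase; it reuses the sha256() sketch given earlier) captures the rule in one place:

import os

def results_are_current(result_file):
    """True if result_file exists and matches its recorded .sha256 digest."""
    checksum_file = result_file + '.sha256'
    if not (os.path.exists(result_file) and os.path.exists(checksum_file)):
        return False
    recorded = open(checksum_file).readline().strip()
    return sha256(result_file) == recorded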
Exemplo n.º 36
0
  def run(self, genome_dir, genome_report, threads):
    # get list of genomes to consider
    genomes_to_consider = set()
    for line in open(genome_report):
        line_split = line.strip().split('\t')
        genome_id = line_split[1]

        attributes = line_split[2].split(';')
        for attribute in attributes:
            if attribute == 'new' or attribute == 'modified':
                genomes_to_consider.add(genome_id)

    print 'Identified %d genomes as new or modified.' % len(genomes_to_consider)

    # get path to all unprocessed genome gene files
    print 'Reading genomes.'
    gene_files = []
    for species_dir in os.listdir(genome_dir):
      cur_genome_dir = os.path.join(genome_dir, species_dir)
      if os.path.isdir(cur_genome_dir):
        for assembly_id in os.listdir(cur_genome_dir):
          prodigal_dir = os.path.join(cur_genome_dir, assembly_id, 'prodigal')
          genome_id = assembly_id[0:assembly_id.find('_', 4)]
          
          ko_file = os.path.join(prodigal_dir, genome_id + '_ko.tsv')
          if os.path.exists(ko_file):
                # verify checksum
                checksum_file = ko_file + '.sha256'
                if os.path.exists(checksum_file):
                  checksum = sha256(ko_file)
                  cur_checksum = open(checksum_file).readline().strip()
                  if checksum == cur_checksum:
                    if genome_id in genomes_to_consider:
                        print '[WARNING] Genome %s is marked as new or modified, but already has KO annotations.' % genome_id
                        print '[WARNING] Genome is being skipped!'
                    continue
                    
                print '[WARNING] Genome %s has KO annotations, but an invalid checksum and was not marked for reannotation.' % genome_id
                print '[WARNING] Genome will be reannotated.'
      
          elif genome_id not in genomes_to_consider:
            print '[WARNING] Genome %s has no KO annotations, but is also not marked for processing?' % genome_id
            print '[WARNING] Genome will be reannotated!'

          gene_file = os.path.join(prodigal_dir, genome_id + self.protein_file_ext)
          if os.path.exists(gene_file):
            if os.stat(gene_file).st_size == 0:
                print '[WARNING] Protein file appears to be empty: %s' % gene_file
            else:
                gene_files.append(gene_file)

    print '  Number of unprocessed genomes: %d' % len(gene_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in gene_files:
      workerQueue.put(f)

    for _ in range(threads):
      workerQueue.put(None)

    try:
      workerProc = [mp.Process(target = self.__workerThread, args = (workerQueue, writerQueue)) for _ in range(threads)]
      writeProc = mp.Process(target = self.__writerThread, args = (len(gene_files), writerQueue))

      writeProc.start()

      for p in workerProc:
        p.start()

      for p in workerProc:
        p.join()

      writerQueue.put(None)
      writeProc.join()
    except:
      for p in workerProc:
        p.terminate()

      writeProc.terminate()
Exemplo n.º 37
0
  def run(self, genome_dir, genome_report, threads):
    # get list of genomes to consider
    genomes_to_consider = set()
    for line in open(genome_report):
        line_split = line.strip().split('\t')
        genome_id = line_split[1]
        
        attributes = line_split[2].split(';')
        for attribute in attributes:
            if attribute == 'new' or attribute == 'modified':
                genomes_to_consider.add(genome_id)

    print 'Identified %d genomes as new or modified.' % len(genomes_to_consider)
            
    # get path to all unprocessed genome gene files
    print 'Reading genomes.'
    gene_files = []
    for species_dir in os.listdir(genome_dir):
      cur_genome_dir = os.path.join(genome_dir, species_dir)
      if os.path.isdir(cur_genome_dir):
        for assembly_id in os.listdir(cur_genome_dir):
          prodigal_dir = os.path.join(cur_genome_dir, assembly_id, 'prodigal')
          genome_id = assembly_id[0:assembly_id.find('_', 4)]
          
          pfam_file = os.path.join(prodigal_dir, genome_id + '_pfam.tsv')
          if os.path.exists(pfam_file):
                # verify checksum
                checksum_file = pfam_file + '.sha256'
                if os.path.exists(checksum_file):
                  checksum = sha256(pfam_file)
                  cur_checksum = open(checksum_file).readline().strip()
                  if checksum == cur_checksum:
                    if genome_id in genomes_to_consider:
                        print '[WARNING] Genome %s is marked as new or modified, but already has Pfam annotations.' % genome_id
                        print '[WARNING] Genome is being skipped!'
                    continue
                    
                print '[WARNING] Genome %s has Pfam annotations, but an invalid checksum and was not marked for reannotation.' % genome_id
                print '[WARNING] Genome will be reannotated.'
      
          elif genome_id not in genomes_to_consider:
            print '[WARNING] Genome %s has no Pfam annotations, but is also not marked for processing?' % genome_id
            print '[WARNING] Genome will be reannotated!'

          gene_file = os.path.join(prodigal_dir, genome_id + self.protein_file_ext)
          if os.path.exists(gene_file):
            if os.stat(gene_file).st_size == 0:
                print '[WARNING] Protein file appears to be empty: %s' % gene_file
            else:
                gene_files.append(gene_file)

    print '  Number of unprocessed genomes: %d' % len(gene_files)

    # populate worker queue with data to process
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    for f in gene_files:
      workerQueue.put(f)

    for _ in range(threads):
      workerQueue.put(None)

    try:
      workerProc = [mp.Process(target = self.__workerThread, args = (workerQueue, writerQueue)) for _ in range(threads)]
      writeProc = mp.Process(target = self.__writerThread, args = (len(gene_files), writerQueue))

      writeProc.start()

      for p in workerProc:
        p.start()

      for p in workerProc:
        p.join()

      writerQueue.put(None)
      writeProc.join()
    except:
      for p in workerProc:
        p.terminate()

      writeProc.terminate()
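
Examples 36 and 37 differ only in the annotation type they check; the multiprocessing scaffolding is identical. A distilled version of that pattern (illustrative; the real code keeps it inline and wraps it in a try/except that terminates the processes on failure) shows the moving parts:

import multiprocessing as mp

def run_parallel(worker, writer, items, num_workers):
    """Fan items out to worker processes; a writer process tallies results."""
    worker_queue = mp.Queue()
    writer_queue = mp.Queue()

    for item in items:
        worker_queue.put(item)
    for _ in range(num_workers):
        worker_queue.put(None)  # one shutdown sentinel per worker

    worker_procs = [mp.Process(target=worker, args=(worker_queue, writer_queue))
                    for _ in range(num_workers)]
    writer_proc = mp.Process(target=writer, args=(len(items), writer_queue))

    writer_proc.start()
    for p in worker_procs:
        p.start()
    for p in worker_procs:
        p.join()

    writer_queue.put(None)  # all workers done; tell the writer to finish
    writer_proc.join()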
Exemplo n.º 38
0
    def run(self, genome_dir, domain, genome_list, threads):
        genomes_to_process = None
        if genome_list:
            genomes_to_process = set()
            for line in open(genome_list):
                line_split = line.strip().split("\t")
                genome_id = line_split[0]
                if genome_id.startswith("GB_") or genome_id.startswith("RS_"):
                    genome_id = genome_id[3:]

                genomes_to_process.add(genome_id)

        # get path to all unprocessed genome gene files
        print "Reading genomes."
        genome_files = []
        for genome_id in os.listdir(genome_dir):
            cur_genome_dir = os.path.join(genome_dir, genome_id)
            if os.path.isdir(cur_genome_dir):
                for assembly_id in os.listdir(cur_genome_dir):
                    assembly_dir = os.path.join(cur_genome_dir, assembly_id)

                    genome_id = assembly_id[0 : assembly_id.find("_", 4)]
                    if genomes_to_process and genome_id not in genomes_to_process:
                        continue

                    # skip genome if Prokka results already exist with a valid checksum
                    prokka_dir = os.path.join(assembly_dir, "prokka")
                    prokka_file = os.path.join(prokka_dir, assembly_id + ".faa")
                    if os.path.exists(prokka_file):
                        # verify checksum
                        checksum_file = prokka_file + ".sha256"
                        if os.path.exists(checksum_file):
                            checksum = sha256(prokka_file)
                            cur_checksum = open(checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                continue

                    genome_file = os.path.join(assembly_dir, assembly_id + self.genome_file_ext)
                    if os.path.exists(genome_file):
                        genome_files.append(genome_file)

        print "  Number of unprocessed genomes: %d\n" % len(genome_files)

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in genome_files:
            workerQueue.put(f)

        for _ in range(threads):
            workerQueue.put(None)

        try:
            workerProc = [
                mp.Process(target=self.__workerThread, args=(domain, workerQueue, writerQueue)) for _ in range(threads)
            ]
            writeProc = mp.Process(target=self.__writerThread, args=(len(genome_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            writerQueue.put(None)
            writeProc.join()
        except:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
Exemplo n.º 39
0
    def _run_prodigal(self, genome_paths):
        """Run Prodigal on genomes."""

        # get genome path and translation table for each file
        self.logger.info('Determining genomic file and translation table for each of the %d genomes.' % len(genome_paths))
        genome_files = []
        translation_table = {}
        for gid, gpath in genome_paths.items():
            assembly_id = os.path.basename(os.path.normpath(gpath))
            canonical_gid = assembly_id[0:assembly_id.find('_', 4)]
            
            genome_file = os.path.join(gpath, assembly_id + '_genomic.fna')
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    self.logger.warning('Genomic file appears to be empty: %s' % genome_file)
                    continue
                
                genome_files.append(genome_file)
            else:
                self.logger.warning('Genomic file appears to be missing: %s' % genome_file)
                    
            gff_file = os.path.join(gpath, assembly_id + '_genomic.gff')
            if os.path.exists(gff_file):
                if os.stat(gff_file).st_size == 0:
                    self.logger.warning('GFF appears to be empty: %s' % gff_file)
                    continue

                tt = self._parse_translation_table(gff_file)
                if tt:
                    translation_table[canonical_gid] = tt
                else:
                    translation_table[canonical_gid] = None
                    self.logger.error('Unable to determine translation table for: %s' % gff_file)
                    sys.exit(-1)
            else:
                self.logger.error('GFF appears to be missing: %s' % gff_file)
                sys.exit(-1)
        
        # run Prodigal on each genome
        self.logger.info('Running Prodigal on %d genomes.' % len(genome_paths))
        prodigal = Prodigal(cpus=self.cpus)
        summary_stats = prodigal.run(genome_files, 
                                    translation_table=translation_table, 
                                    output_dir=self.tmp_dir)

        # move results into individual genome directories
        self.logger.info('Moving files and calculating checksums.')
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)
            canonical_gid = genome_id[0:genome_id.find('_', 4)]
            
            aa_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(self.tmp_dir, genome_id + '.gff')

            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path, canonical_gid + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path, canonical_gid + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path, canonical_gid + '_protein.gff')

            os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
            os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
            os.system('mv %s %s' % (gff_file, new_gff_file))

            # save translation table information
            translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
            fout = open(translation_table_file, 'w')
            if translation_table[canonical_gid]:
                fout.write('%s\t%d\t%s\n' % ('best_translation_table', 
                                                summary_stats[genome_id].best_translation_table,
                                                'used table specified by NCBI'))
            else:
                fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
                fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
                fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
            fout.close()

            checksum = sha256(new_aa_gene_file)
            fout = open(new_aa_gene_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
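
_parse_translation_table() is referenced above but not shown. A plausible sketch, assuming the table is taken from a transl_table=N attribute in the NCBI GFF annotation, is:

    # plausible sketch; 're' is assumed to be imported at module level
    def _parse_translation_table(self, gff_file):
        """Return the translation table declared in a GFF file, or None."""
        with open(gff_file) as f:
            for line in f:
                m = re.search(r'transl_table=(\d+)', line)
                if m:
                    return int(m.group(1))
        return None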
Exemplo n.º 40
0
    def moveGenomes(self, db_genome_ids):
        """Move genome files into database directory structure.

        This function assumes addGenomes() has been called. It is
        not directly called by addGenomes() as all database
        queries are performed before moving genomes.

        Parameters
        ----------
        db_genome_ids : list
            Unique database identifiers for genomes.
        """

        assert self.tmp_output_dir

        # get database genome identifiers
        self.cur.execute(
            "SELECT genomes.id,user_editable, external_id_prefix || '_' || id_at_source as external_id "
            + "FROM genomes, genome_sources "
            + "WHERE genome_source_id = genome_sources.id "
            + "AND genomes.id in %s",
            (tuple(db_genome_ids),),
        )

        external_id_dict = {}
        for (genome_id, user_editable, external_id) in self.cur:
            if user_editable:
                external_id_dict[genome_id] = external_id

        # nothing to move if no user-editable genomes were returned
        if not external_id_dict:
            shutil.rmtree(self.tmp_output_dir)
            return

        if self.currentUser.isRootUser():
            username = self.currentUser.getElevatedFromUsername()
        else:
            username = self.currentUser.getUsername()

        if username is None:
            raise GenomeDatabaseError("Unable to determine user to add genomes under.")

        gtdb_target_dir = os.path.join(self.genomeCopyDir, username)
        for db_genome_id, external_id in external_id_dict.items():
            tmp_genome_dir = os.path.join(self.tmp_output_dir, external_id)

            genome_target_dir = os.path.join(gtdb_target_dir, external_id)
            if os.path.exists(genome_target_dir):
                raise GenomeDatabaseError("Genome directory already exists: %s" % genome_target_dir)

            shutil.move(tmp_genome_dir, genome_target_dir)

            self.cur.execute(
                "UPDATE genomes SET fasta_file_location = %s , genes_file_location = %s , genes_file_sha256 = %s WHERE id = %s",
                (
                    os.path.join(username, external_id, external_id + self.genomeFileSuffix),
                    os.path.join(username, external_id, self.userAnnotationDir, external_id + self.proteinFileSuffix),
                    sha256(
                        os.path.join(genome_target_dir, self.userAnnotationDir, external_id + self.proteinFileSuffix)
                    ),
                    db_genome_id,
                ),
            )

        shutil.rmtree(self.tmp_output_dir)
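
A note on the "AND genomes.id in %s" clause above: psycopg2 adapts a Python tuple parameter to a parenthesised SQL list, which is why the ids are passed as (tuple(db_genome_ids),). A minimal illustration:

# 'cur' is an open psycopg2 cursor (illustrative fragment); the tuple is
# rendered as (12, 15, 42), so the IN clause stays safely parameterised.
# Note that psycopg2 cannot adapt an empty tuple.
ids = [12, 15, 42]
cur.execute("SELECT id FROM genomes WHERE id IN %s", (tuple(ids),))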
Exemplo n.º 41
0
    def run(self, genome_dir, domain, threads):

        genomes_to_consider = None

        # get path to all genome files
        print 'Reading genomes.'
        genome_files = []
        for species_dir in os.listdir(genome_dir):
            cur_genome_dir = os.path.join(genome_dir, species_dir)
            if os.path.isdir(cur_genome_dir):
                for assembly_id in os.listdir(cur_genome_dir):
                    assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                    trna_dir = os.path.join(assembly_dir, 'trna')
                    genome_id = assembly_id[0:assembly_id.find('_', 4)]

                    trna_file = os.path.join(trna_dir, genome_id + '_trna.tsv')
                    if os.path.exists(trna_file):
                        # verify checksum
                        checksum_file = trna_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(trna_file)
                            cur_checksum = open(
                                checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                if genomes_to_consider and genome_id in genomes_to_consider:
                                    print '[WARNING] Genome %s is marked as new or modified, but already has tRNAs called.' % genome_id
                                    print '[WARNING] Genome is being skipped!'
                                continue

                        print '[WARNING] Genome %s has tRNAs called, but an invalid checksum and was not marked for reannotation.' % genome_id
                        print '[WARNING] Genome will be reannotated.'

                    elif genomes_to_consider and (genome_id
                                                  not in genomes_to_consider):
                        print '[WARNING] Genome %s has no tRNAs called, but is also not marked for processing?' % genome_id
                        print '[WARNING] Genome will be reannotated!'

                    genome_file = os.path.join(
                        assembly_dir, assembly_id + self.genome_file_ext)
                    if os.path.exists(genome_file):
                        if os.stat(genome_file).st_size == 0:
                            print '[WARNING] Genome file appears to be empty: %s' % genome_file
                        else:
                            genome_files.append(genome_file)

        print '  Number of unprocessed genomes: %d' % len(genome_files)

        # populate worker queue with data to process
        workerQueue = mp.Queue()
        writerQueue = mp.Queue()

        for f in genome_files:
            workerQueue.put(f)

        for _ in range(threads):
            workerQueue.put(None)

        try:
            workerProc = [
                mp.Process(target=self.__workerThread,
                           args=(workerQueue, writerQueue, domain))
                for _ in range(threads)
            ]
            writeProc = mp.Process(target=self.__writerThread,
                                   args=(len(genome_files), writerQueue))

            writeProc.start()

            for p in workerProc:
                p.start()

            for p in workerProc:
                p.join()

            writerQueue.put(None)
            writeProc.join()
        except:
            for p in workerProc:
                p.terminate()

            writeProc.terminate()
Exemplo n.º 42
0
    def run(self, sra_path):

        query = ("SELECT last_auto_id from genome_sources where id =1")
        self.temp_cur.execute(query)
        last_id = int(self.temp_cur.fetchone()[0])

        checkm_dict_original = self.parsecheckm(sra_path)

        print os.getlogin()
        sra_dirs = os.listdir(sra_path)
        for sra_dir in sra_dirs:
            temp_path = tempfile.mkdtemp()
            sra_dir = os.path.join(sra_path, sra_dir)
            print sra_dir
            if os.path.isdir(sra_dir):
                bins_dir = os.path.join(sra_dir, 'bins')
                genomes_bin = os.listdir(bins_dir)
                for genome in genomes_bin:
                    if genome.endswith(".fna"):
                        genome_prefix = genome[:-4]
                        last_id += 1
                        temp_user_dir = os.path.join(temp_path,
                                                     "U_" + str(last_id))
                        os.mkdir(temp_user_dir)
                        print temp_user_dir
                        print genome_prefix

                        shutil.copyfile(
                            os.path.join(bins_dir, genome),
                            os.path.join(temp_user_dir,
                                         "U_" + str(last_id) + "_genomic.fna"))

                        metadata_dir = os.path.join(sra_dir, 'metadata',
                                                    genome_prefix)
                        self.copytree(metadata_dir, temp_user_dir)

                        prodigal_dir = os.path.join(temp_user_dir, 'prodigal')
                        os.mkdir(prodigal_dir)

                        for old_name in glob.glob(sra_dir + "/prodigal/" +
                                                  genome_prefix + "_*"):
                            new_name = old_name.replace(
                                sra_dir + "/prodigal", prodigal_dir)
                            new_name = new_name.replace(
                                genome_prefix, "U_" + str(last_id))
                            shutil.copy(old_name, new_name)

                        for old_name in glob.glob(sra_dir + "/pfam/" +
                                                  genome_prefix + "_*"):
                            new_name = old_name.replace(
                                sra_dir + "/pfam", prodigal_dir)
                            new_name = new_name.replace(
                                genome_prefix, "U_" + str(last_id))
                            shutil.copy(old_name, new_name)

                        for old_name in glob.glob(sra_dir + "/tigrfam/" +
                                                  genome_prefix + "_*"):
                            new_name = old_name.replace(
                                sra_dir + "/tigrfam", prodigal_dir)
                            new_name = new_name.replace(
                                genome_prefix, "U_" + str(last_id))
                            shutil.copy(old_name, new_name)

                        list_genome_details = [genome_prefix]
                        list_genome_details.append(
                            genome_prefix + " (21/07/2016)")  # description
                        list_genome_details.append(False)
                        list_genome_details.append(30)
                        fasta_file_path = os.path.join(
                            os.getlogin(), genome_prefix,
                            genome_prefix + "_genomic.fna")
                        list_genome_details.append(fasta_file_path)
                        list_genome_details.append(
                            sha256(
                                os.path.join(
                                    temp_user_dir,
                                    "U_" + str(last_id) + "_genomic.fna")))
                        list_genome_details.append(1)
                        list_genome_details.append(str(last_id))
                        list_genome_details.append("21-07-2016")
                        list_genome_details.append(False)
                        list_genome_details.append("21-07-2016")
                        gene_file_path = os.path.join(
                            os.getlogin(), genome_prefix, "prodigal",
                            genome_prefix + "_protein.faa")
                        list_genome_details.append(gene_file_path)
                        list_genome_details.append(
                            sha256(
                                os.path.join(
                                    temp_user_dir, "prodigal",
                                    "U_" + str(last_id) + "_protein.faa")))

                        self.temp_cur.execute(
                            "INSERT INTO genomes " +
                            "(name,description,owned_by_root,owner_id,fasta_file_location,fasta_file_sha256,genome_source_id,id_at_source,date_added,has_changed,last_update,genes_file_location,genes_file_sha256) "
                            +
                            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) RETURNING id ",
                            list_genome_details)
                        (new_gid, ) = self.temp_cur.fetchone()
                        self.temp_cur.execute(
                            "INSERT INTO metadata_nucleotide (id) VALUES ({0})"
                            .format(new_gid))
                        self.temp_cur.execute(
                            "INSERT INTO metadata_genes (id) VALUES ({0})".
                            format(new_gid))
                        self.temp_cur.execute(
                            "INSERT INTO metadata_taxonomy (id) VALUES ({0})".
                            format(new_gid))
                        self.temp_cur.execute(
                            "INSERT INTO metadata_ssu (id) VALUES ({0})".
                            format(new_gid))

                        # insertion of metadata
                        with open(
                                os.path.join(
                                    temp_user_dir,
                                    "metadata.genome_nt.tsv")) as metntf:
                            for line in metntf:
                                line_tab = line.strip().split()
                                self.temp_cur.execute(
                                    "UPDATE metadata_nucleotide set {0}=%s WHERE id =%s "
                                    .format(line_tab[0]),
                                    (line_tab[1], new_gid))

                        # insertion of metadata
                        with open(
                                os.path.join(
                                    temp_user_dir,
                                    "metadata.genome_gene.tsv")) as metgenef:
                            for line in metgenef:
                                line_tab = line.strip().split()
                                self.temp_cur.execute(
                                    "UPDATE metadata_genes set {0}=%s WHERE id =%s "
                                    .format(line_tab[0]),
                                    (line_tab[1], new_gid))

                        for key, value in checkm_dict_original.get(
                                genome_prefix).iteritems():
                            self.temp_cur.execute(
                                "UPDATE metadata_genes set {0}=%s WHERE id =%s "
                                .format(key), (value, new_gid))

                        shutil.copytree(
                            os.path.join(temp_user_dir),
                            os.path.join("/srv/db/gtdb/genomes/user",
                                         os.getlogin(), "U_" + str(last_id)))

                        self.temp_con.commit()
            shutil.rmtree(temp_path)
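
self.copytree() above is assumed to copy the contents of a directory into one that already exists, something shutil.copytree() refused to do before Python 3.8. A sketch of that assumed helper:

    # assumed helper; 'os' and 'shutil' are imported at module level
    def copytree(self, src, dst):
        """Copy the contents of src into the existing directory dst."""
        for item in os.listdir(src):
            s = os.path.join(src, item)
            d = os.path.join(dst, item)
            if os.path.isdir(s):
                shutil.copytree(s, d)
            else:
                shutil.copy2(s, d)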
Exemplo n.º 43
0
    def _addGenomeToDB(self, fasta_file_path, name, desc, source, id_at_source, gene_path):
        """Add genome to database.

        Parameters
        ----------
        fasta_file_path : str
            Path to genome FASTA file with nucleotide sequences.
        name : str
            Desired name of genome.
        desc : str
            Description of genome.
        source : str
            Source of genome.
        id_at_source : int
            ?
        gene_path : str
            Path to called genes in amino acid space.

        Returns
        -------
        str
            Database identifier of genome.
        """
        try:
            fasta_sha256_checksum = sha256(fasta_file_path)

            gene_sha256_checksum = None
            if gene_path is not None:
                gene_sha256_checksum = sha256(gene_path)
            if source is None:
                source = self.defaultGenomeSourceName

            self.cur.execute(
                "SELECT id, external_id_prefix, user_editable FROM genome_sources WHERE name = %s", (source,)
            )
            source_id = None

            for (db_id, _external_id_prefix, user_editable) in self.cur:
                if not user_editable:
                    if id_at_source is None:
                        raise GenomeDatabaseError(
                            "Cannot auto generate ids at source for the %s genome source." % source
                        )
                    if not self.currentUser.isRootUser():
                        raise GenomeDatabaseError(
                            "Only the root user can add genomes to the %s genome source." % source
                        )
                source_id = db_id
                break

            if source_id is None:
                raise GenomeDatabaseError("Could not find the %s genome source." % source)

            if id_at_source is None:
                # Use UPDATE to return the next id. This prevents concurrent threads from obtaining the same value, as UPDATE locks the row for the duration of the transaction.
                self.cur.execute("SELECT update_last_auto(%s);", (source_id,))
                id_at_source = str(self.cur.fetchone()[0])

            added = datetime.datetime.now()

            owner_id = None
            if not self.currentUser.isRootUser():
                owner_id = self.currentUser.getUserId()

            self.cur.execute(
                "SELECT id FROM genomes WHERE genome_source_id = %s AND id_at_source = %s", (source_id, id_at_source)
            )

            result = self.cur.fetchall()

            columns = (
                "(name, description, owned_by_root, owner_id, fasta_file_location, "
                + "fasta_file_sha256, genes_file_location, genes_file_sha256,genome_source_id, id_at_source, date_added)"
            )

            if result:
                raise GenomeDatabaseError(
                    "Genome source '%s' already contains id '%s'. Use -f to force an overwrite."
                    % (source, id_at_source)
                )

            self.cur.execute(
                "INSERT INTO genomes " + columns + " "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) " + "RETURNING id",
                (
                    name,
                    desc,
                    self.currentUser.isRootUser(),
                    owner_id,
                    fasta_file_path,
                    fasta_sha256_checksum,
                    gene_path,
                    gene_sha256_checksum,
                    source_id,
                    id_at_source,
                    added,
                ),
            )
            (db_genome_id,) = self.cur.fetchone()

            return db_genome_id

        except GenomeDatabaseError:
            raise  # preserve the original traceback
Exemplo n.º 44
0
    def run(self, input_dir, tmp_dir, threads):
        # get path to all unprocessed genome files
        print 'Reading genomes.'
        genome_files = []
        for genome_dir in os.listdir(input_dir):
            cur_genome_dir = os.path.join(input_dir, genome_dir)
            if not os.path.isdir(cur_genome_dir):
                continue
              
            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                # check if prodigal has already been called
                if False:
                    # for safety, genes are currently recalled for all genomes;
                    # the disabled check below skips genomes with valid
                    # checksums and is far more efficient
                    aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
                    if os.path.exists(aa_gene_file):
                        # verify checksum
                        checksum_file = aa_gene_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(aa_gene_file)
                            cur_checksum = open(checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                continue

                genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print '[WARNING] Genome file appears to be empty: %s' % genome_file
                    else:
                        genome_files.append(genome_file)

        print '  Number of unprocessed genomes: %d' % len(genome_files)

        # run prodigal on each genome
        print 'Running prodigal.'
        prodigal = Prodigal(cpus=threads)
        summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

        # move results into individual genome directories
        print 'Moving files and calculating checksums.'
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)
            
            aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

            os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
            os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
            os.system('mv %s %s' % (gff_file, new_gff_file))

            # save translation table information
            translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
            fout = open(translation_table_file, 'w')
            fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
            fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
            fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
            fout.close()

            checksum = sha256(new_aa_gene_file)
            fout = open(new_aa_gene_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
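
The prodigal_translation_table.tsv files written above are plain two-column, tab-separated key/value records. A matching reader for downstream consumers (illustrative; not part of the original code):

def read_translation_table_info(path):
    """Parse a prodigal_translation_table.tsv into a key -> value dict."""
    info = {}
    with open(path) as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            if len(fields) >= 2:
                info[fields[0]] = fields[1]
    return info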