Exemplo n.º 1
0
 def __init__(self, infile, outfile, logpath=None, outfmt="fasta"):
     """Store the alignment parameters and configure a logger.

     :param infile: Path/name of the multi-FASTA input file.
     :param outfile: Path/name of the multiple-alignment output file.
     :param logpath: Optional path to a logfile. (Default value = None)
     :param outfmt: Output alignment format. (Default value = "fasta")
     """
     # Keep the logpath first so the logger below can pick it up.
     self.logpath = logpath
     self.infile = infile
     self.outfile = outfile
     self.outfmt = outfmt
     # Route clustalo messages to the (optional) logfile.
     self.clustalolog = LogIt().default('clustalo', logfile=self.logpath)
Exemplo n.º 2
0
    def __init__(self,
                 project,
                 project_path=None,
                 solo=False,
                 multi=True,
                 archive=False,
                 min_fasta=True,
                 blast=OrthoBlastN,
                 **kwargs):
        """Handle GenBank files in various ways.

        It allows for refseq-release .gbff files to be downloaded from NCBI
        and uploaded to a BioSQL database (biopython).  Single .gbk files can be
        downloaded from the .gbff, and uploaded to a custom BopSQL database for
        faster acquisition of GenBank data.

        :param project:  The name of the project.
        :param project_path: The relative path to the project.
        :param solo:  A flag for adding single fasta files.
        :param multi:  A flag for adding multi-fasta files.
        :param archive: A flag for archiving current GenBank Data.  # TODO
        :param min_fasta: A flag for minimizing FASTA file headers.
        :param blast:  The blast parameter is used for composing various
                       Orthologs.Blast classes.  Can be a class, a dict,
                       or none.
        :param kwargs:  Additional keyword arguments; not used directly here.
        :returns:  .gbff files/databases, .gbk files/databases, & FASTA files.
        """

        # TODO-ROB: Change the way the file systems work.
        # NOTE(review): `archive` is accepted but never stored on the
        # instance (see the TODO above) — confirm whether that is intended.
        self.project = project
        self.project_path = project_path
        self.solo = solo
        self.multi = multi
        self.min_fasta = min_fasta
        self.genbanklog = LogIt().default(logname="GenBank", logfile=None)

        # Configuration of class attributes.  attribute_config composes this
        # instance with the `blast` composer; the resulting attributes are
        # copied onto self below.
        add_self = attribute_config(self,
                                    composer=blast,
                                    checker=OrthoBlastN,
                                    checker2=BaseComparativeGenetics,
                                    project=project,
                                    project_path=project_path)
        for var, attr in add_self.__dict__.items():
            setattr(self, var, attr)

        # Configuration
        # FIXME AttributeError: 'GenBank' object has no attribute 'user_db'
        # NOTE(review): `user_db` (and `ncbi_db_repo` below) are expected to
        # come from the attribute composition above — confirm the composer
        # always provides them.
        self.target_gbk_db_path = self.user_db / Path(self.project)
        Path.mkdir(self.target_gbk_db_path, parents=True, exist_ok=True)

        # Make a list of BioSQL database(.db) files that contain GenBank info
        self.db_files_list = []
        for FILE in os.listdir(str(self.ncbi_db_repo)):
            if FILE.endswith('.db'):
                self.db_files_list.append(str(FILE))
Exemplo n.º 3
0
    def __init__(self, base_jobname):
        """Set up default SGE job attributes and locate the PBS template.

        :param base_jobname: Base name used when deriving job names.
        """
        self.base_jobname = base_jobname
        self.pbsworkdir = os.getcwd()
        self.sgejob_log = LogIt().default(logname="SGE JOB", logfile=None)
        # Default job attributes plus a helper for reading files to strings.
        self.default_job_attributes = __DEFAULT__
        self.file2str = file2str

        # Resolve the bundled temp.pbs template via pkg_resources.
        self.temp_pbs = resource_filename(templates.__name__, "temp.pbs")
Exemplo n.º 4
0
 def __init__(self, logname):
     """Prepare the queue, process handle, and logger for stream capture.

     This class threads the capture of a system command's stdout and
     stderr, queueing lines in arrival order so that a separate thread
     can parse the queue and print via the LogIt class.
     """
     self.process = None
     self.io_q = Queue()
     # Tag the logger with the caller-supplied name.
     self.streamieolog = LogIt().default(logname="%s - streamieo" % logname,
                                         logfile=None)
Exemplo n.º 5
0
    def __init__(self,
                 project=None,
                 project_path=os.getcwd(),
                 genbank=GenBank,
                 **kwargs):
        """Initialize the MultipleSequenceAlignment class.

        :param project: The project name.
        :param project_path:  The path to the project.
        :param genbank: The composer parameter which is used to configure the
                        GenBank class with the MSA class.
        :param kwargs:  The kwargs are used with the dispatcher as a way to
                        control the alignment pipeline.
        :returns: If the kwargs are utilized with YAML or other
                  configurations, then this class returns an alignment
                  dictionary, which can be parsed to run specific alignment
                  algorithms.
        """
        # Map configuration keys to (program name, aligner method) pairs.
        self.dispatcher_options = {
            "Guidance_config": ["GUIDANCE2", self.guidance2],
            "Pal2Nal_config": ["PAL2NAL", self.pal2nal],
            "ClustalO_config": ["CLUSTALO", self.clustalo]
        }
        # Set up one logger per supported alignment program.
        logit = LogIt()
        logfile = None
        self.guidancelog = logit.default('guidance2', logfile)
        self.pal2nallog = logit.default('pal2nal', logfile)
        self.clustalolog = logit.default('clustalo', logfile)

        # stop_codons = ['TAG', 'TAA', 'TGA']

        self.program = None
        self.alignment_dict = {}
        self.project = project
        self.project_path = project_path
        if project_path and project:
            self.project_path = Path(project_path) / Path(project)

        # Compose this instance's attributes with the GenBank composer.
        composed = attribute_config(self,
                                    composer=genbank,
                                    checker=GenBank,
                                    project=project,
                                    project_path=project_path)
        for attr_name, attr_value in composed.__dict__.items():
            setattr(self, attr_name, attr_value)

        # Determine which alignment(s) to configure and record each one
        # with its configuration so it can be dispatched later.
        for config_key, (prog_name, aligner_func) in self.dispatcher_options.items():
            if config_key in kwargs:
                self.alignment_dict[prog_name] = [aligner_func, kwargs[config_key]]
Exemplo n.º 6
0
class StreamIEO(object):
    """Threaded capture and logging of a subprocess's stdout/stderr.

    Each stream is watched by its own thread and queued in arrival order;
    a printer thread drains the queue and logs each line via LogIt.
    """

    def __init__(self, logname):
        """
        A class that individually threads the capture of the stdout stream and the stderr stream of a system command.
        The stdout/stderr are queued in the order they occur.  As the que populates another thread, parse the que and
        prints to the screen using the LogIT class.
        """
        # Queue of (identifier, line) tuples shared by the watcher threads.
        self.io_q = Queue()
        # Popen handle; set by streamer().
        self.process = None
        self.streamieolog = LogIt().default(logname="%s - streamieo" % logname,
                                            logfile=None)

    def streamer(self, cmd):
        """Run *cmd* in a shell and start the watcher/printer threads.

        :param cmd: The shell command string to execute.
        """
        # NOTE(review): shell=True means `cmd` is interpreted by the shell —
        # do not pass untrusted input.
        self.process = Popen(cmd,
                             stdout=PIPE,
                             stderr=PIPE,
                             shell=True,
                             encoding='utf-8')
        # Add the command line to the que
        self.io_q.put(("STDIN", cmd))
        # Watch the standard output and add it to the que
        Thread(target=self._stream_watcher,
               name='stdout-watcher',
               args=('STDOUT', self.process.stdout)).start()
        # Watch the standard error and add it to the que
        Thread(target=self._stream_watcher,
               name='stderr-watcher',
               args=('STDERR', self.process.stderr)).start()
        # As items are added, print the stream.
        Thread(target=self._printer, name='_printer').start()

    def _stream_watcher(self, identifier, stream):
        """Queue every line of *stream*, tagged with *identifier*, then close it."""
        # Watch the stream and add to the que dynamically
        # This runs in tandem with the printer.  So as the stdout/stderr streams are queued here,
        # the que is parsed and printed in the printer function.
        for line in stream:
            self.io_q.put((identifier, line))
        if not stream.closed:
            stream.close()

    def _printer(self):
        """Drain the queue and log each line until the process exits."""
        # Prints the que as it is populated with stdout/stderr dynamically.
        while True:
            try:
                # Block for 1 second.
                item = self.io_q.get(True, 1)
            except Empty:
                # No output in either streams for a second. Are we done?
                # NOTE(review): if the process has exited but a watcher
                # thread is still queueing, late lines could be dropped here
                # — confirm this is acceptable.
                if self.process.poll() is not None:
                    break
            else:
                identifier, line = item
                if identifier == "STDIN":
                    self.streamieolog.warn("Command: " + line.strip())
                elif identifier == "STDERR":
                    self.streamieolog.error(line.strip())
                elif identifier == "STDOUT":
                    self.streamieolog.info(line.strip())
                else:
                    self.streamieolog.critical(identifier + ':' + line.strip())
Exemplo n.º 7
0
def attribute_config(cls, composer, checker, project=None, project_path=None, checker2=None):
    """Set/configure attributes.

    Attribute Configuration takes an instance of a class and sets various
    attributes. The attributes are set by determining the type of
    configuration. The class attributes can be composed by another class,
    they can be set with a dictionary, or they can be set using the basic
    project template.

    :param cls: An instance of a class that will retain the attributes.
    :param composer: A class that will yield attributes to the cls parameter.
    :param checker: A checker class used to check the type of the composer.
                    Dictionary composers will be treated differently.
    :param project:  The name of the project. (Default value = None)
    :param project_path:  The relative path of the project.
                          (Default value = None)
    :param checker2:  An optional second checker class. (Default value = None)
    :return:  Returns the instance (cls) with new attributes.
    :raises BrokenPipeError:  If no composer is given and neither a project
                              name nor a project path is supplied, or if the
                              project attributes remain unset afterwards.
    """
    clsnm = cls.__class__.__name__
    ac_log = LogIt().default(logname="%s" % clsnm, logfile=None)
    if checker2:
        check2 = issubclass(type(composer), checker2)
    else:
        check2 = None
    # Attribute configuration using checker composition.
    if issubclass(type(composer), checker) or check2:
        for key, value in composer.__dict__.items():
            setattr(cls, key, value)
        clsnm = cls.__class__.__name__
        compnm = composer.__class__.__name__
        msg = "The attribute configuration was accomplished by composing {0} with {1}.".format(clsnm, compnm)
        ac_log.info(msg)

    # Attribute configuration using a dictionary.
    elif isinstance(composer, dict):
        for key, value in composer.items():
            setattr(cls, key, value)
        clsnm = cls.__class__.__name__
        msg = "The attribute configuration of {0} was accomplished by using a dictionary.".format(clsnm)
        ac_log.info(msg)

    # Attribute configuration without composer
    elif composer is None:
        if not (project or project_path):
            # Bug fix: this message was previously split across two
            # statements, leaving the second half as a no-op expression and
            # truncating the raised message.
            msg = ("Without the Project Management class, a project name and "
                   "project path must be included.")
            raise BrokenPipeError(msg)
        cls = standalone_config(cls, project, project_path)
        clsnm = cls.__class__.__name__
        msg = "The attribute configuration of {0} was accomplished without a composer.".format(clsnm)
        ac_log.info(msg)
    # Make sure self.project and self.project_path have values
    if not (cls.project or cls.project_path):
        msg = "The project name and project path attributes have not been set."
        raise BrokenPipeError(msg)

    return cls
Exemplo n.º 8
0
 def test_logit(self):
     """Verify LogIt creates a named logger and its log file, then clean up."""
     logger_factory = LogIt()
     handle = logger_factory.default(logname='testlog', logfile=self.logfile)
     # LogIt upper-cases the logger name it registers.
     self.assertEqual(str(handle.name), 'TESTLOG')
     self.assertTrue(os.path.isfile(self.logfile))
     # Shut the logger down before deleting its file.
     logger_factory.shutdown()
     logger_factory.deletelog(self.logfile)
Exemplo n.º 9
0
    def __init__(self, infile, outfile):
        """Set up the mygene handle and load the accessions list.

        Get the basic gene information. It's best to use a csv file and title
        the row of the accessions list `Accessions`.

        :param infile: Input a csv file with a `Homo_sapiens` column
        :param outfile: Desired path & name of the output file.
        """
        self.infile = infile
        self.outfile = outfile
        self.mygene_log = LogIt().default('mygene', None)

        # Handle to the MyGene.info service.
        self.mg = mygene.MyGeneInfo()
        # Accession numbers parsed from the input file.
        self.accessions_list = self._import_accfile()

        # Default query fields and target species.
        self.fields = 'symbol,name,entrezgene,summary'
        self.species = 'human'
Exemplo n.º 10
0
class ClustalO(object):
    """Align genes using Clustal Omega.

    This class is a further wrapper around Biopython's ClustalOmegaCommandline.

    :param infile: Path/Name of multiple fasta file.
    :param outfile: Path/Name of multiple alignment file.
    :param logpath: Path to logfile. (Default = None)
    :param outfmt: Format of the output multiple alignment file.
    """

    def __init__(self, infile, outfile, logpath=None, outfmt="fasta"):
        """Set up the logger and the parameters."""
        self.infile = infile
        self.outfile = outfile
        self.outfmt = outfmt
        self.logpath = logpath
        self.clustalolog = LogIt().default('clustalo', logfile=self.logpath)

    def runclustalomega(self):
        """Run clustalomega and log its standard output.

        ApplicationError from the command is caught and logged rather
        than propagated.
        """
        try:
            # Build the clustal omega command for the multifasta file.
            clustalo_cline = ClustalOmegaCommandline(
                infile=self.infile,
                cmd="clustalo",
                outfile=self.outfile,
                # "RNA"/"DNA"
                seqtype="PROTEIN",
                max_hmm_iterations=2,
                infmt="fasta",
                # "aln", "phy"
                outfmt=self.outfmt,
                iterations=3,  # Notable
                verbose=True,
                force=True,
                log=self.logpath)
            # Bug fix: the command line was previously invoked twice in a
            # row, running the (expensive) alignment two times.  Invoke it
            # once and capture stdout for logging.
            stdout, _ = clustalo_cline()
            self.clustalolog.info(stdout)

        except ApplicationError as err:
            self.clustalolog.error(err)
Exemplo n.º 11
0
    def __init__(self, alignment, dataType='CODON', working_dir=''):
        """Run IQTree to generate a "filtered" tree or best tree.

        Running the constructor copies the alignment into an IQTREE
        subdirectory, changes the process working directory into it,
        runs IQTree, and copies the resulting tree file back out.

        :param alignment: Path to multiple sequence alignment file.
        :param dataType:  Input datatype. (Default value = 'CODON')
        :param working_dir: Path of working directory.  (Default value = '')
        """
        self.iqtree_log = LogIt().default(logname="iqtree", logfile=None)
        self.working_dir = Path(working_dir)
        self.iqtree_path = self.working_dir / Path('IQTREE')
        # IQTree writes "<alignment>.treefile" next to its input.
        self.tree_file = self.iqtree_path / Path(alignment + '.treefile')
        # Derive the gene name by stripping the expected alignment suffix.
        self.gene = alignment.replace('_P2N_na.iqtree.aln', '')

        self.aln_File = str(self.working_dir / Path(alignment))
        outDir = self.working_dir / Path('IQTREE')
        makedirectory(outDir)
        copy(self.aln_File, str(outDir))
        # NOTE(review): this changes the process-wide working directory and
        # never restores it — confirm callers tolerate that side effect.
        os.chdir(str(outDir))
        self.iqtree_best_tree(alignment, dataType)
        # Copy the best tree out as "<gene>_iqtree.nwk".
        treepath = str(self.working_dir / Path(self.gene + '_iqtree.nwk'))
        copy(self.tree_file, treepath)
Exemplo n.º 12
0
    def __init__(self,
                 database_name,
                 template_name="",
                 project=None,
                 project_path=None,
                 proj_mana=ProjectManagement,
                 **kwargs):
        """
        This is the base BioSQL class.  It provides a framework for uploading schemas, loading taxonomy data from NCBI
        and ITIS using the BioSQL perl scripts and .sql schema files provided by the BioPython package.  We have created
        a modified version of the BioSQL scripts in our package, which can be found on GitHub.  Taxonomy data can be
        found at:
            NCBI:  ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy
            ITIS:  http://www.itis.gov/downloads/

        :param database_name:  The name of the database.
        :param template_name:  The name of a database template file.
                               (Default value = "")
        :param project:  The name of the project.  (Default value = None)
        :param project_path:  The relative path to the project.
                              (Default value = None)
        :param proj_mana:  A composer class used to configure project
                           attributes.  (Default value = ProjectManagement)
        :param kwargs:  Additional keyword arguments; not used directly here.
        """
        # Logging setup
        self.biosqllog = LogIt().default(logname="BioSQL", logfile=None)
        self.biosqlstream = StreamIEO(logname="BioSQL")

        # Load relative and absolute paths to scripts in the BioSQL module
        self.scripts = pkg_resources.resource_filename(sql_scripts.__name__,
                                                       "")
        self.ncbi_taxon_script = pkg_resources.resource_filename(
            sql_scripts.__name__, "load_ncbi_taxonomy.pl")
        self.itis_taxon_script = pkg_resources.resource_filename(
            sql_scripts.__name__, "load_itis_taxonomy.pl")
        self.database_name = Path(database_name)

        # Configuration of class attributes for Project Management.
        if project_path and project:
            self.project_path = Path(project_path) / Path(project)

        if proj_mana:
            # Compose attributes from the ProjectManagement class.
            # NOTE(review): `user_index` and `user_db` are expected to come
            # from this composition — confirm the composer provides them.
            add_self = attribute_config(self,
                                        composer=proj_mana,
                                        checker=ProjectManagement,
                                        project=project,
                                        project_path=project_path)
            for var, attr in add_self.__dict__.items():
                setattr(self, var, attr)
            self.template_rel_path = self.user_index
            self.template_abs_path = self.template_rel_path / Path(
                template_name)
            self.databases_path = self.user_db
        else:
            # Standalone layout: index/ and databases/ under the project path.
            self.project_path = Path(project_path) / Path(project)
            self.template_rel_path = self.project_path / Path('index')
            self.template_abs_path = self.template_rel_path / Path(
                template_name)
            self.databases_path = Path(project_path) / Path('databases')
Exemplo n.º 13
0
    def __init__(self, alignmentfile, speciestree, workdir=''):
        """Initialize main variables/files to be used.

        :param alignmentfile:  Input alignment file in fasta format
        :param speciestree:  A newick formatted species tree.
        :param workdir:  Directory of alignment file and species tree.
                         (Default value = '')
        """
        self.ete3paml_log = LogIt().default(logname="ete3paml", logfile=None)
        self.alignmentfile = alignmentfile
        self.speciestree = speciestree
        self.workdir = workdir

        # Import your species tree (newick format 1)
        self._speciestree = Tree(self.speciestree, format=1)
        # TODO import organisms list

        # Import alignment file as string.  A context manager guarantees the
        # file handle is closed even if reading raises.
        with open(self.alignmentfile, 'r') as alignment_file:
            self.aln_str = alignment_file.read()
Exemplo n.º 14
0
    def __init__(self, repo=None, user=None, project=None, basic_project=False,
                 website=None, db_repo="databases", output_dir=None,
                 recipes=None):
        """Deploy custom cookiecutter templates:

        The Oven uses the different Ingredients (parameters/attributes) and
        the Cook Book(cookiecutter templates) to bake_the_cookies
        in the Oven(class methods).

        After the cookies cool, they are put in the cookie_jar (output directory).

        :param repo (string):  An ingredient representing the repository name.
        :param user (string):  An ingredient representing the user name
        :param project (string):  An ingredient representing the project name.
        :param basic_project (bool):  A secret ingredient ONLY for the basic project cookie.
        :param website (string):  An ingredient representing the website name.
        :param db_repo (string):  The name of the database repository folder.
        :param output_dir (path or pathlike):  The cookie jar for storing the
                                               cookies.  Defaults to the
                                               current working directory.
        :param recipes:  An index for the different recipe templates.
                         Defaults to a fresh CookBook instance.
        """
        # Bug fix: the original signature used import-time-evaluated defaults
        # (`output_dir=os.getcwd()`, `recipes=CookBook()`), which freeze the
        # working directory at import and share one CookBook across all
        # instances.  Resolve them at call time instead.
        if output_dir is None:
            output_dir = os.getcwd()
        if recipes is None:
            recipes = CookBook()

        self.cookielog = LogIt().default(logname="Cookies", logfile=None)
        self.cookie_jar = output_dir
        # Below are the PyPi path strings
        #    The first group is to access the cookiecutter templates
        self.repo = repo
        self.user = user
        self.project = project
        self.basic_project = basic_project
        self.website = website
        self.db_repo = db_repo
        self.Recipes = recipes
        self.Ingredients = {"repo": self.repo,
                            "user": self.user,
                            "project": self.project,
                            "basic_project": self.basic_project,
                            "website": self.website,
                            "db_repo": self.db_repo,
                            "recipes": self.Recipes.__dict__}
Exemplo n.º 15
0
class FilteredTree(object):
    """This is a  wrapper around the IQTree wrapper to get the best tree."""

    def __init__(self, alignment, dataType='CODON', working_dir=''):
        """Run IQTree to generate a "filtered" tree or best tree.

        Running the constructor copies the alignment into an IQTREE
        subdirectory, changes the process working directory into it,
        runs IQTree, and copies the resulting tree file back out.

        :param alignment: Path to multiple sequence alignment file.
        :param dataType:  Input datatype. (Default value = 'CODON')
        :param working_dir: Path of working directory.  (Default value = '')
        """
        self.iqtree_log = LogIt().default(logname="iqtree", logfile=None)
        self.working_dir = Path(working_dir)
        self.iqtree_path = self.working_dir / Path('IQTREE')
        # IQTree writes "<alignment>.treefile" next to its input.
        self.tree_file = self.iqtree_path / Path(alignment + '.treefile')
        # Derive the gene name by stripping the expected alignment suffix.
        self.gene = alignment.replace('_P2N_na.iqtree.aln', '')

        self.aln_File = str(self.working_dir / Path(alignment))
        outDir = self.working_dir / Path('IQTREE')
        makedirectory(outDir)
        copy(self.aln_File, str(outDir))
        # NOTE(review): this changes the process-wide working directory and
        # never restores it — confirm callers tolerate that side effect.
        os.chdir(str(outDir))
        self.iqtree_best_tree(alignment, dataType)
        # Copy the best tree out as "<gene>_iqtree.nwk".
        treepath = str(self.working_dir / Path(self.gene + '_iqtree.nwk'))
        copy(self.tree_file, treepath)

    def iqtree_best_tree(self, alignment, dataType):
        """Generate and save the best tree from IQTree.

        Builds the IQTree command line, logs it, and runs it with
        ``check_call`` (raises CalledProcessError on a non-zero exit).

        :param alignment:  Path to multiple sequence alignment file.
        :param dataType:  Input datatype passed to IQTree.
        :return: None; IQTree writes its output files to disk.
        """

        iqtree_cline = IQTreeCommandline(alignment=alignment,
                                         dataType=dataType)
        self.iqtree_log.info(iqtree_cline)
        check_call([str(iqtree_cline)], stderr=STDOUT, shell=True)
Exemplo n.º 16
0
    def __init__(self,
                 project,
                 email,
                 driver,
                 project_path=None,
                 proj_mana=ProjectManagement):
        """Manage the creation of various databases for the pipeline.

        Provides functionality for downloading files from NCBI (BLAST,
        windowmasker, taxonomy, refseq release), downloading ITIS taxonomy
        tables, creating BioSQL databases, and uploading refseq release
        files to BioSQL databases.  This class currently REQUIRES an
        instance of ProjectManagement to be used with the proj_mana
        parameter.

        :param project: The name of the project.
        :type project: str.
        :param email: The email of the user for using during the FTP.
        :type email: str.
        :param driver: The driver used for creating the BioSQL databases.
        :type driver:  str.
        :param project_path: A path used for standalone/basic project configuration.
        :type project_path: str.
        :param proj_mana: A configuration variable for connecting projects.
        :type proj_mana: ProjectManagement.
        """
        self.db_mana_log = LogIt().default(logname="DatabaseManagement",
                                           logfile=None)
        self.project = project
        self.email = email
        self.driver = driver
        self.database_dict = {}
        # FTP client for NCBI downloads, registered with the user's email.
        self.ncbiftp = NcbiFTPClient(email=self.email)
        self.biosql = biosql
        self.proj_mana = proj_mana

        # Compose attributes from ProjectManagement when available;
        # otherwise fall back to a standalone database path.
        if proj_mana:
            composed = attribute_config(self,
                                        composer=proj_mana,
                                        checker=ProjectManagement,
                                        project=project,
                                        project_path=project_path)
            for attr_name, attr_value in composed.__dict__.items():
                setattr(self, attr_name, attr_value)
            self.database_path = self.user_db
        else:
            self.database_path = Path(project_path)
Exemplo n.º 17
0
    def __init__(self, phyml_input, datatype='aa'):
        """Run phyml to generate tree results.

        If you're using Linux, ensure that your phyml path is set in your bash
        profile. If you're using Windows, this function will look for the name
        of the executable 'PhyML-3.1_win32.exe'.

        :param phyml_input: Input file for phyml.
        :param datatype: Input data type. (Default value = 'aa')
        """
        # Bug fix: the logger was mis-named "GenBank" (copy-paste error);
        # name it after this class instead.
        self.phyml_log = LogIt().default(logname="PhyML", logfile=None)

        # Select the phyml executable: the Windows binary by name, otherwise
        # assume `phyml` is available on the PATH.
        win32 = "win32"
        executable = "PhyML-3.1_win32.exe"
        self.phyml_exe = executable if sys.platform == win32 else "phyml"
        self.datatype = datatype
        self.phyml_input = phyml_input
        self._runphyml()
Exemplo n.º 18
0
import time
from datetime import datetime
# import shutil
# import pkg_resources
from importlib import import_module
from multiprocessing.pool import ThreadPool
from pathlib import Path
import pandas as pd
import platform
from warnings import warn

from OrthoEvol.Tools.logit import LogIt
from OrthoEvol import OrthoEvolDeprecationWarning
from OrthoEvol.Tools.otherutils import runcmd

# Module-level loggers for the BLAST utilities defined in this file.
blastutils_log = LogIt().default(logname="blast-utils", logfile=None)
seqidlist_log = LogIt().default(logname="gi-lists", logfile=None)

# Human-readable timestamp (e.g. "09:30:00 AM on 01-31-2020") captured at
# import time, for labeling output.
_datefmt = '%I:%M:%S %p on %m-%d-%Y'
_date = str(datetime.now().strftime(_datefmt))


def map_func(hit):
    """Format a BLAST hit id into accession and gi attributes.

    Splits the pipe-delimited id (e.g. ``"gi|12345|ref|NP_001.1|"``) and
    attaches the accession number as ``hit.id1`` and the gi number as
    ``hit.id2``.

    :param hit: ID of the blast hit.
    :return: None; ``hit`` is mutated in place.
    """
    # Split once instead of twice.
    id_parts = hit.id.split('|')
    hit.id1 = id_parts[3]  # accession number
    hit.id2 = id_parts[1]  # gi number
Exemplo n.º 19
0
def archive(database_path, archive_path, option, delete_flag=False):
    """Archive a database directory from a Cookie templated directory structure.

    This utility creates a YAML config dictionary that contains path-like
    objects for archiving.  The original data
    can be moved to the archive path or deleted all together.

    :param database_path:  A path to a folder that consists of the desired data.
    :param archive_path:  A path to an output folder for archived data.
    :param option:  An option for the archiving strategy.  Will be one of the
                    keys in the archive_options.
    :param delete_flag:  A flag for deleting the original data.  USE WITH CAUTION.
    :return:  Returns a list of paths to the *.tar.xz archive of the data
              and/or a path to the original data.

    """

    archive_dict = {}
    archive_list = []
    archive_log = LogIt().default(logname="Archive", logfile=None)

    if option == "Full":
        full_path = Path(database_path) / archive_options["Full"]
        for folder in os.listdir(str(full_path)):
            # Bug fix: the directory test previously used the bare folder
            # name, which is resolved relative to the CWD; join it with the
            # listed directory so the check looks at the right path.
            if os.path.isdir(os.path.join(str(full_path), folder)):
                archive_dict[folder] = database_path / Path(folder)
    elif isinstance(option, list):
        for opt in option:
            other_path = Path(database_path) / archive_options[opt]
            archive_dict[opt] = other_path
    else:
        other_path = Path(database_path) / archive_options[option]
        archive_dict[option] = other_path

    for arch_name, data_path in archive_dict.items():
        root_dir = str(data_path.parent)
        base_dir = str(data_path.stem)
        # Bug fix: strftime() takes no keyword arguments, so the previous
        # `strftime(fmt=...)` call raised a TypeError; pass it positionally.
        d = datetime.datetime.now().strftime("%Y-%m-%d_%H%M")
        output_pathname = archive_path / Path(arch_name + "." + d)
        # Archive the desired data.
        data_size = get_size(start_path=str(data_path))
        archive_log.info("Archiving %s of data." % data_size)
        archive_filename = shutil.make_archive(base_name=str(output_pathname),
                                               format="xztar",
                                               root_dir=root_dir,
                                               base_dir=base_dir)
        archive_size = get_size(archive_filename)
        archive_log.warning("A %s archive file was created at %s." %
                            (archive_filename, archive_size))
        # TODO-ROB:  Logging.  And log to a README.md file.
        # Delete the files if desired.
        if delete_flag:
            archive_log.critical(
                "The original data will be deleted recursively at %s." %
                data_path)
            from OrthoEvol import OrthoEvolWarning
            OrthoEvolWarning(
                "You're about to delete your database (%s).  Are you sure??" %
                data_path)
            shutil.rmtree(path=data_path)
            archive_list.append(str(archive_filename))
        else:
            archive_log.critical(
                "The original data will be moved recursively from %s to %s." %
                (data_path, output_pathname))
            output_pathname.mkdir()
            shutil.move(src=str(data_path), dst=str(output_pathname))
            shutil.move(src=str(archive_filename), dst=str(output_pathname))
            archive_list.append(str(output_pathname))

        # Recreate the (now empty) original data directory.
        Path(data_path).mkdir(parents=True, exist_ok=True)
    return archive_list
Exemplo n.º 20
0
class ETE3PAML(object):
    """Use ETE3's M1 model to run PAML's codeml for orthology inference."""

    def __init__(self, alignmentfile, speciestree, workdir=''):
        """Initialize main variables/files to be used.

        :param alignmentfile:  Input alignment file in fasta format
        :param speciestree:  A newick formatted species tree.
        :param workdir:  Directory of alignment file and species tree.
                         (Default value = '')
        """
        self.ete3paml_log = LogIt().default(logname="ete3paml", logfile=None)
        self.alignmentfile = alignmentfile
        self.speciestree = speciestree
        self.workdir = workdir

        # Import your species tree (newick format 1)
        self._speciestree = Tree(self.speciestree, format=1)
        # TODO import organisms list

        # Import alignment file as string.  A context manager guarantees the
        # file handle is closed even if reading raises.
        with open(self.alignmentfile, 'r') as alignment_file:
            self.aln_str = alignment_file.read()

    def prune_tree(self, organisms):
        """Prune branches for species not in the alignment file.

        Keep branches in the species tree for species in the alignment file
        Some species may not be present in the alignment file due to lack of
        matching with blast or simply the gene not being in the genome.

        :param organisms: A list of organisms in the alignment file, or a
                          path to a csv file containing them.
        """

        if os.path.isfile(organisms):
            organismslist = csvtolist(organisms)
        else:
            organismslist = organisms

        branches2keep = []
        for organism in organismslist:
            if organism in self.aln_str:
                branches2keep.append(organism)
            else:
                self.ete3paml_log.warning('No sequence for %s.' % organism)

        # Bug fix: the prune/write block was previously indented inside the
        # loop above, re-pruning the tree on every iteration (and failing on
        # later iterations once nodes had been removed).  Prune exactly once
        # after the full keep-list is built.
        self._speciestree.prune(branches2keep, preserve_branch_length=True)

        # Write the pruned tree to a file
        self._speciestree.write(
            outfile=os.path.join(self.workdir, 'temptree.nw'))
        self.ete3paml_log.info('temptree.nw was created.')

    def run(self, pamlsrc, output_folder, model='M1'):
        """Run PAML using ETE.

        The default model is M1 as it is best for orthology inference in
        our case. You can use models `M2`, `M0`, `M3`.

        Ensure that you have the correct path to your codeml binary. It should
        be in the paml `/bin`.

        :param pamlsrc: Path to the codemly binary.
        :param output_folder: The name of the output folder.
        :param model: The model to be used. (Default value = 'M1')
        """

        # Import the newick tree
        # NOTE(review): 'temptree.nw' is loaded relative to the CWD here,
        # but prune_tree writes it under self.workdir — confirm callers run
        # from the workdir.
        tree = EvolTree('temptree.nw')

        # Import the alignment
        tree.link_to_alignment(self.alignmentfile)

        tree.workdir = self.workdir

        # Set the binpath of the codeml binary
        tree.execpath = pamlsrc
        # Run the model M1, M2, M3, or M0
        model_path = model + '.' + output_folder
        tree.run_model(model_path)
        self.ete3paml_log.info('Codeml is generating data in %s.' % model_path)
Exemplo n.º 21
0
class Management(object):
    def __init__(self, repo=None, home=os.getcwd(), new_repo=False, **kwargs):
        """Base class for directory management.

        It maps the directories of the OrthoEvol-Script package using the
        pathlib module, and turns the names of each important directory into
        a pathlike object.  The base class gives the option of creating a new
        repository with cookiecutter.

        :param repo (string): The name of the new repository to be created.
        :param home (path or path-like): The home of the file calling this name.
                                        When creating a new repository it is
                                        best to explicitly name the home path.
        :param new_repo (bool): Creates a new repository."""

        self.repo = repo
        # Home of the file calling this class.
        self.file_home = Path(home)
        self.managementlog = LogIt().default(logname="Management",
                                             logfile=None)

        # Cookies Module: oven + its recipe index.
        self.Kitchen = Oven(repo=self.repo, output_dir=self.file_home)
        self.Pantry = self.Kitchen.Recipes

        # Resolve the installed package roots once, up front.
        self.Manager = Path(
            pkg_resources.resource_filename(Manager.__name__, ''))
        self.Orthologs = Path(
            pkg_resources.resource_filename(Orthologs.__name__, ''))
        self.Tools = Path(pkg_resources.resource_filename(Tools.__name__, ''))

        # Manager Module subdirectories.
        self.BioSQL = self.Manager / Path('BioSQL')
        self.SQLite3 = self.BioSQL / Path('sqlite')
        self.MySQL = self.BioSQL / Path('mysql')
        self.config = self.Manager / Path('config')

        # Orthologs Module subdirectories.
        for ortho_dir in ('Align', 'Blast', 'GenBank', 'Phylogenetics'):
            setattr(self, ortho_dir, self.Orthologs / Path(ortho_dir))

        # Tools Module subdirectories.
        for tool_dir in ('ftp', 'logit', 'mpi', 'mygene', 'pandoc',
                         'parallel', 'pybasher', 'send2server', 'sge',
                         'slackify', 'otherutils'):
            setattr(self, tool_dir, self.Tools / Path(tool_dir))

        if self.repo:
            self.repo_path = self.file_home / Path(self.repo)
        self.managementlog.info(
            'The BaseManagement class variables have been set.')

        # Make a new repository.
        if new_repo is True:
            self.managementlog.info(
                'The repository cookie is being prepared for the Oven.')
            self.Kitchen.bake_the_repo()
Exemplo n.º 22
0
class BaseSGEJob(object):
    """Base class for simple SGE/PBS jobs."""

    def __init__(self, base_jobname, config=None):
        """Initialize job attributes.

        :param base_jobname: The base name used when generating job ids.
        :param config: Optional mapping of job attributes; the module-level
                       __DEFAULT__ attributes are used when omitted.
        """
        self.base_jobname = base_jobname
        if not config:
            self.default_job_attributes = __DEFAULT__
        else:
            self.default_job_attributes = config
        self.file2str = file2str
        self.sgejob_log = LogIt().default(logname="SGE JOB", logfile=None)
        self.pbsworkdir = os.getcwd()

        # Import the temp.pbs file using pkg_resources
        self.temp_pbs = resource_filename(templates.__name__, "temp.pbs")

    @classmethod
    def _configure(cls, length, base_jobname):
        """Configure job attributes or set it up.

        :param length: Length of the id appended to the base job name.
        :param base_jobname: The base name used to generate the job id.
        :return: A (baseid, base) tuple from basejobids().
        """

        baseid, base = basejobids(length, base_jobname)
        return baseid, base

    def debug(self, code):
        """Debug the SGEJob.  Not implemented yet.

        :param code: The code to debug.
        :raises NotImplementedError: Always; debugging is not supported yet.
        """

        raise NotImplementedError

    def _cleanup(self, jobname):
        """Clean up job scripts (the generated .pbs and .py files).

        :param jobname: The name of the job being run or to be run.
        """

        self.sgejob_log.warning('Your job will now be cleaned up.')
        os.remove(jobname + '.pbs')
        self.sgejob_log.warning('%s.pbs has been deleted.', jobname)
        os.remove(jobname + '.py')
        self.sgejob_log.warning('%s.py has been deleted.', jobname)

    def wait_on_job_completion(self, job_id):
        """Use Qstat to monitor your job until it completes.

        Implemented as a polling loop (the original recursive version could
        exhaust the recursion limit on long-running jobs, and its fallback
        branch re-polled immediately in a tight loop).

        :param job_id: The job id to be monitored.
        """

        # TODO Allow either slack notifications or email or text.
        while True:
            qwatch = Qstat().watch(job_id)
            if qwatch == 'Job id not found.':
                self.sgejob_log.info('%s has finished.', job_id)
                sleep(30)
                break
            elif qwatch == 'Waiting for %s to start running.' % job_id:
                self.sgejob_log.info('%s is queued to run.', job_id)
                self.sgejob_log.info('Waiting for %s to start.', job_id)
                sleep(30)
            elif qwatch == 'Waiting for %s to finish running.' % job_id:
                self.sgejob_log.info('%s is running.', job_id)
                self.sgejob_log.info('Waiting for %s to finish.', job_id)
                sleep(30)
            else:
                # Unknown status: pause before re-polling so we don't
                # hammer the scheduler.
                sleep(30)

    def submitjob(self, cleanup=False, wait=True):
        """Submit a job using qsub.

        :param cleanup: Delete the generated job scripts after the run.
                        (Default value = False)
        :param wait: Block until the job completes. (Default value = True)
        """
        try:
            # NOTE(review): self.jobname is expected to be set by a subclass
            # before submitjob is called -- confirm against callers.
            cmd = ['qsub ' + self.jobname + '.pbs']  # this is the command
            # Shell MUST be True
            cmd_status = run(cmd, stdout=PIPE, stderr=PIPE, shell=True, check=True)
        except CalledProcessError as err:
            self.sgejob_log.error(err.stderr.decode('utf-8'))
            if cleanup:
                self._cleanup(self.jobname)
        else:
            if cmd_status.returncode == 0:  # Command was successful.
                # The cmd_status has stdout that must be decoded.
                # When a qsub job is submitted, the stdout is the job id.
                submitted_jobid = cmd_status.stdout.decode('utf-8')
                self.sgejob_log.info('%s was submitted.', self.jobname)
                self.sgejob_log.info('Your job id is: %s', submitted_jobid)
                if wait is True:
                    self.wait_on_job_completion(submitted_jobid)
                    self._cleanup(self.jobname)

            else:  # Unsuccessful. Stdout will be '1'
                self.sgejob_log.error('PBS job not submitted.')
Exemplo n.º 23
0
class MyGene(object):
    """Import a csv of refseq accessions & get gene information from mygene."""

    def __init__(self, infile, outfile):
        """Initialize my gene handle and refseq/accessions list.

        Get the basic gene information. It's best to use a csv file and title
        the row of the accessions list `Accessions`.

        :param infile: Input a csv file with a `Homo_Sapiens` column that
                       holds the refseq accession numbers.
        :param outfile: Desired path & name of the output file.
        """
        self.infile = infile
        self.outfile = outfile

        self.mygene_log = LogIt().default('mygene', None)

        self.mg = mygene.MyGeneInfo()  # Set up mygene handle
        self.accessions_list = self._import_accfile()  # Create accessions list

        self.fields = 'symbol,name,entrezgene,summary'  # Default fields
        self.species = 'human'  # Species to use.

    def _import_accfile(self):
        """Import the accession file and make the Homo_Sapiens column a list.

        :return: A list of upper-cased accession numbers.
        """

        accfile = pd.read_csv(self.infile)
        # Normalize to upper case so the mygene query is consistent.
        return [accession.upper() for accession in accfile['Homo_Sapiens']]

    def query_mygene(self):
        """Query mygene for gene information and write it to the outfile."""

        self.mygene_log.info('You are querying: %s' % self.accessions_list)
        basic_info = self.mg.querymany(self.accessions_list,
                                       scopes='refseq',
                                       fields=self.fields,
                                       species=self.species,
                                       returnall=True,
                                       as_dataframe=True,
                                       size=1,
                                       verbose=True)

        # basic_info['out'] is the output dataframe.
        # Use basic_info.keys() to find dict keys
        # Reset the index on the dataframe so that each column is on the same
        # level
        basic_info['out'].reset_index(level=0, inplace=True)
        gene_info = pd.DataFrame(basic_info['out'])
        # Drop the query/bookkeeping columns; keep only the gene fields.
        gene_info.drop(gene_info.columns[[0, 1, 2]], axis=1, inplace=True)
        gene_info.rename(columns={
            'symbol': 'Gene Symbol',
            'entrezgene': 'Entrez ID',
            'name': 'Gene Name',
            'summary': 'Summary'
        },
                         inplace=True)

        # Create the NCBI links using the Entrez ID of each record.
        baseurl = 'https://www.ncbi.nlm.nih.gov/gene/'
        urllist = []
        for entrezid in gene_info['Entrez ID']:
            url = baseurl + str(int(entrezid))
            # Important step
            # Format the url so that it becomes a hyperlink
            url = '<a href="{0}">{0}</a>'.format(url)
            urllist.append(url)

        # Turn the ncbiurls list into a dataframe using pandas
        ncbiurls = pd.DataFrame(urllist, columns=['NCBI Link'], dtype=str)

        # Merge the gene info and link dataframes into 1 dataframe
        alldata = pd.concat([gene_info, ncbiurls], axis=1)

        # Save the merged dataframes to a file
        alldata.to_csv(self.outfile, index=False)
        self.mygene_log.info('%s has been created.' % str(self.outfile))
Exemplo n.º 24
0
class Oven(object):
    """Class that deploys cookiecutter templates."""

    def __init__(self, repo=None, user=None, project=None, basic_project=False, website=None, db_repo="databases",
                 output_dir=os.getcwd(), recipes=None):
        """Deploy custom cookiecutter templates:

        The Oven uses the different Ingredients (parameters/attributes) and
        the Cook Book(cookiecutter templates) to bake_the_cookies
        in the Oven(class methods).

        After the cookies cool, they are put in the cookie_jar (output directory).

        :param repo (string):  An ingredient representing the repository name.
        :param user (string):  An ingredient representing the user name
        :param project (string):  An ingredient representing the project name.
        :param basic_project (bool):  A secret ingredient ONLY for the basic project cookie.
        :param website (string):  An ingredient representing the website name.
        :param db_repo (string):  An ingredient representing the database repository name.
        :param output_dir (path or pathlike):  The cookie jar for storing the cookies.
        :param recipes (CookBook):  An index for the different recipe templates.
                                    A fresh CookBook is built when omitted
                                    (avoids a shared default instance).
        """
        self.cookielog = LogIt().default(logname="Cookies", logfile=None)
        self.cookie_jar = output_dir
        # Below are the PyPi path strings
        #    The first group is to access the cookiecutter templates
        self.repo = repo
        self.user = user
        self.project = project
        self.basic_project = basic_project
        self.website = website
        self.db_repo = db_repo
        # A shared CookBook() default argument would be created once at class
        # definition time; build one per instance instead.
        self.Recipes = recipes if recipes is not None else CookBook()
        self.Ingredients = {"repo": self.repo,
                            "user": self.user,
                            "project": self.project,
                            "basic_project": self.basic_project,
                            "website": self.website,
                            "db_repo": self.db_repo,
                            "recipes": self.Recipes.__dict__}

    def bake_the_repo(self, cookie_jar=None):
        """Create a new repository.

        This function creates a new repository.  If a repository name
        is given to the class then it is given a name.  If not, cookiecutters
        takes input from the user.

        The base class will be the only class that allows cookiecutters parameter
        no_input to be False.

        :param cookie_jar:  (Default value = None)
        """

        self.cookielog.warning('Creating directories from the Repository Cookie template.')
        if cookie_jar:
            self.cookie_jar = cookie_jar
        if self.repo:
            no_input = True
            e_c = {
                "repository_name": self.repo
            }
        else:
            no_input = False
            e_c = None
            # TODO-ROB change cookiecutter so that it can take pathlike objects
        cookiecutter(str(self.Recipes.repo_cookie), no_input=no_input,
                     extra_context=e_c, output_dir=str(self.cookie_jar))
        os.chmod(str(self.cookie_jar / Path(self.repo)), mode=0o777)
        self.cookielog.info('Repository directories have been created. ✔')

    def bake_the_user(self, cookie_jar=None):
        """Create a new directory system for the active user.

        This function uses the username given by our FLASK framework
        and creates a new directory system for the active user using
        our  new_user cookiecutter template.

        :param cookie_jar:  (Default value = None)
        """

        self.cookielog.warning('Creating directories from the User Cookie template.')
        if cookie_jar:
            self.cookie_jar = cookie_jar

        # This is used ONLY when the user registers in flask
        # TODO-ROB:  Create the cookiecutter.json file

        # extra_context overrides user and default configs
        cookiecutter(str(self.Recipes.user_cookie), no_input=True, extra_context={
            "user_name": self.user}, output_dir=str(self.cookie_jar))

        # Change user permissions with flask later (this is for testing
        # purposes
        os.chmod(str(self.cookie_jar / Path(self.user)), mode=0o777)
        self.cookielog.info('Directories have been created for the user, %s. ✔' % self.user)

    def bake_the_project(self, cookie_jar=None):
        """Create a new project in the user's directory.

        :param cookie_jar:  (Default value = None)
        """

        self.cookielog.warning('Creating directories from the Project Cookie template.')
        if cookie_jar:
            self.cookie_jar = cookie_jar
        # Add the project
        if self.project:
            no_input = True
            e_c = {"project_name": self.project}
            project_log_message = "(%s)" % self.project
        else:
            no_input = False
            e_c = None
            project_log_message = "that has been named with user input"

        if not self.basic_project:
            self.cookielog.warning('A project linked to a user/repository is being created.')
            cookiecutter(str(self.Recipes.project_cookie), extra_context=e_c, no_input=no_input,
                         output_dir=str(self.cookie_jar))
            # Logging
            if self.user:
                self.cookielog.info('Directories have been created for %s\'s project %s. ✔' % (self.user, project_log_message))
            else:
                self.cookielog.info('Directories have been created for %s.' % project_log_message)
        else:
            self.cookielog.warning('A basic standalone project is being created.')
            cookiecutter(str(self.Recipes.basic_project_cookie), extra_context=e_c, no_input=no_input,
                         output_dir=str(self.cookie_jar))
            self.cookielog.info('Directories have been created for a standalone project %s. ✔' % project_log_message)
        os.chmod(str(self.cookie_jar / Path(self.project)), mode=0o777)

    def bake_the_db_repo(self, db_config_file, db_path, cookie_jar=None, archive_flag=False, delete=False):
        """Create a database directory.

        :param db_config_file: A database configuration file.
        :param db_path: The path to the database.
        :param cookie_jar:  (Default value = None)
        :param archive_flag:  (Default value = False)
        :param delete:  (Default value = False)
        :return: A new database inside the users database directory
        """

        # TODO-ROB:  Work work this in with the database management class.
        if cookie_jar:
            self.cookie_jar = cookie_jar
        # TODO-ROB:  Rework this for new archive function.
        # NOTE: archive_flag/delete are currently unused; the archive code
        # path is disabled pending the new archive function.
        # if archive_flag:
        #     archive_list = archive(database_path=db_path, archive_path=self.cookie_jar, config_file=db_config_file, delete_flag=delete)
        #     for arch in archive_list:
        #         self.cookielog.info("An archive has been created at %s." % arch)

        # Restored from the disabled block above: without these assignments
        # the cookiecutter call below raised NameError on e_c/no_input.
        if self.db_repo:
            no_input = True
            e_c = {"db_name": self.db_repo}
        else:
            no_input = False
            e_c = None

        cookiecutter(str(self.Recipes.db_cookie), extra_context=e_c, no_input=no_input, output_dir=str(self.cookie_jar))
        self.cookielog.info("Directories have been created for a database repository %s." %
                            str((self.cookie_jar / Path(self.db_repo))))
        os.chmod(str(self.cookie_jar / Path(self.db_repo)), mode=0o777)
        #
        # for db_key, db_value in db_config_dict["Database_Config"].items():
        #     if db_value:
        #         pass
        # TODO-ROB:  Use db_value system with database management configuration.

    def bake_the_website(self, host, port, website_path, cookie_jar=None):
        """Create a website using the new_website cookie.

        After creating the directory structure, the run_script function
        from cookiecutter finds the hooks folder which contains a
        post-cookiecutter-template-generation bash script.  The bash script
        sets up the proper dependencies and environment variables for the
        website, and runs the website on the specified host and port.

        :param host:  The host address for the Flask server.
        :param port:  The port for the Flask server.
        :param website_path:  Path to the generated website directory.
        :param cookie_jar:  (Default value = None)
        """

        self.cookielog.warning('Creating directories from the Website Cookie template.')
        if cookie_jar:
            self.cookie_jar = cookie_jar
        # TODO-ROB:  Add heavy logging here
        e_c = {"website_name": self.website,
               "website_path": os.path.join(str(website_path), ''),
               "website_host": host,
               "website_port": port}
        cookiecutter(str(self.Recipes.website_cookie), no_input=True,
                     extra_context=e_c, output_dir=str(self.cookie_jar))
        os.chmod(str(self.cookie_jar / Path(self.website)), mode=0o777)
        # Get the absolute path to the script that starts the flask server
        script_path = website_path / \
                      Path('hooks') / Path('post_gen_project.sh')
        #scripts_file_path = find_hook('post_gen_project.sh', hooks_dir=str(script_path))
        # TODO-ROB add screening to the bash script for flask run -h -p
        run_script(script_path=str(script_path), cwd=str(website_path))
        self.cookielog.info('Directories have been created for the Flask Web Server, %s. ✔' % self.website)
        self.cookielog.warning('The %s Flask Server should now be running on http://%s:%s' % (self.website, host, port))

    def bake_the_research(self, research_type, research, cookie_jar=None):
        """Create a directory for a new research project.

        :param research_type:  The type of research (e.g. public/private).
        :param research:  The name of the research project.
        :param cookie_jar:  (Default value = None)
        """

        self.cookielog.warning('Creating directories from the Research Cookie template.')
        if cookie_jar:
            self.cookie_jar = cookie_jar

        e_c = {"research_type": research_type,
               "research_name": research}
        cookiecutter(str(self.Recipes.research_cookie), no_input=True,
                     extra_context=e_c, output_dir=str(self.cookie_jar))
        os.chmod(str(self.cookie_jar / Path(research_type)), mode=0o777)
        # script_path = self.project_cookie / Path('hooks') / Path('post_gen_project.py')
        # run_script(script_path, )
        self.cookielog.info('Directories have been created for the %s research project, %s. ✔' % (research_type, research))

    def bake_the_app(self, app, cookie_jar=None):
        """Create an app.

        :param app:  Name of the app.
        :param cookie_jar:  (Default value = None)
        """

        self.cookielog.warning('Creating directories from the App Cookie template.')
        if cookie_jar:
            self.cookie_jar = cookie_jar
        e_c = {"app_name": app}
        cookiecutter(str(self.Recipes.app_cookie), no_input=True,
                     extra_context=e_c, output_dir=str(self.cookie_jar))
        os.chmod(str(self.cookie_jar), mode=0o777)
        self.cookielog.info("Directories have been created for an R-Shiny app, %s. ✔" % app)
Exemplo n.º 25
0
class GenBank(object):
    """This class will handle GenBank files in various ways."""
    def __init__(self,
                 project,
                 project_path=None,
                 solo=False,
                 multi=True,
                 archive=False,
                 min_fasta=True,
                 blast=OrthoBlastN,
                 **kwargs):
        """Handle GenBank files in various ways.

        It allows for refseq-release .gbff files to be downloaded from NCBI
        and uploaded to a BioSQL database (biopython).  Single .gbk files can be
        downloaded from the .gbff, and uploaded to a custom BopSQL database for
        faster acquisition of GenBank data.

        :param project:  The name of the project.
        :param project_path: The relative path to the project.
        :param solo:  A flag for adding single fasta files.
        :param multi:  A flag for adding multi-fasta files.
        :param archive: A flag for archiving current GenBank Data.  # TODO
        :param min_fasta: A flag for minimizing FASTA file headers.
        :param blast:  The blast parameter is used for composing various
                       Orthologs.Blast classes.  Can be a class, a dict,
                       or none.
        :returns:  .gbff files/databases, .gbk files/databases, & FASTA files.
        """

        # TODO-ROB: Change the way the file systems work.
        self.project = project
        self.project_path = project_path
        self.solo = solo
        self.multi = multi
        self.min_fasta = min_fasta
        self.genbanklog = LogIt().default(logname="GenBank", logfile=None)

        # Compose extra attributes from the blast / comparative-genetics
        # configuration and copy them onto this instance.
        composed = attribute_config(self,
                                    composer=blast,
                                    checker=OrthoBlastN,
                                    checker2=BaseComparativeGenetics,
                                    project=project,
                                    project_path=project_path)
        for name, value in composed.__dict__.items():
            setattr(self, name, value)

        # Configuration
        # FIXME AttributeError: 'GenBank' object has no attribute 'user_db'
        self.target_gbk_db_path = self.user_db / Path(self.project)
        Path.mkdir(self.target_gbk_db_path, parents=True, exist_ok=True)

        # Collect the BioSQL database (.db) files that contain GenBank info.
        self.db_files_list = [str(entry)
                              for entry in os.listdir(str(self.ncbi_db_repo))
                              if entry.endswith('.db')]

    @staticmethod
    def name_fasta_file(path, gene, org, feat_type, feat_type_rank, extension,
                        mode):
        """Name a fasta file.

        Provide a uniquely named FASTA file:
        * Coding sequence:
            * Single - "<path>/<gene>_<organism><feat_type_rank>.<extension>"
            * Multi  - "<path>/<gene><feat_type_rank>.<extension>"
        * Other:
            * Single - "<path>/<gene>_<organism>_<feat_type_rank>.<extension>"
            * Multi  - "<path>/<gene>_<feat_type_rank>.<extension>"

        :param path:  The path where the file will be made.
        :param gene:  The gene name.
        :param org:  The organism name.
        :param feat_type:  The type of feature from the GenBank record.
                           (CDS, UTR, misc_feature, variation, etc.)
        :param feat_type_rank:  The feature type  + the rank.
                                (There can be multiple misc_features and
                                 variations)
        :param extension:  The file extension.
                           (".ffn", ".faa", ".fna", ".fasta")
        :param mode:  The mode ("w" or "a") for writing the file.  Write to a
                      solo-FASTA file.  Append a multi-FASTA file.
        :return:  The uniquely named FASTA file, opened in the given mode.
        :raises ValueError:  If mode is neither "w" nor "a".
        """

        # Create path variables.  (typically raw_data/<gene>/GENBANK
        feat_path = path
        # Create a format-able string for file names.  Compare with == here:
        # the original `is "CDS"` identity check only worked by accident of
        # CPython string interning.
        if feat_type_rank == "CDS":
            single = '%s_%s%s%s'
            multi = '%s%s%s'
        else:
            single = '%s_%s_%s%s'
            multi = '%s_%s%s'
        # Create different names based on fasta file type
        if mode == 'w':
            file_path = feat_path / Path(
                single % (gene, org, feat_type_rank, extension))
        elif mode == 'a':
            file_path = feat_path / Path(multi %
                                         (gene, feat_type_rank, extension))
        else:
            # Previously an unknown mode crashed later with NameError.
            raise ValueError("mode must be 'w' (solo) or 'a' (multi), got %r" % mode)

        # Make the base directory and return an open file.
        makedirectory(feat_path)
        file = open(file_path, mode)
        return file

    @staticmethod
    def protein_gi_fetch(feature):
        """Retrieve the protein gi number.

        :param feature:  Search the protein feature for the GI number.
        :return:  The protein GI number as a string, or None if absent.
        """

        # Scan the feature qualifiers; the first entry containing 'GI'
        # yields the number after the ':' separator.
        for qualifier in feature.qualifiers:
            if 'GI' not in qualifier:
                continue
            return qualifier.partition(':')[2]

    def create_post_blast_gbk_records(self, org_list, gene_dict):
        """Create a single GenBank file for each ortholog.

        After a blast has completed and the accession numbers have been compiled
        into an accession file, this class searches a local NCBI refseq release
        database composed of GenBank records.  This method will create a single
        GenBank file (.gbk) for each ortholog with an accession number.
        The create_post_blast_gbk_records is only callable if the
        the instance is composed by one of the Blast classes.  This method also
        requires an NCBI refseq release database to be set up with the proper
        GenBank Flat Files (.gbff) files.

        :param org_list:  List of organisms
        :param gene_dict:  A nested dictionary for accessing accession numbers.
                           (e.g. gene_dict[GENE][ORGANISM} yields an accession
                           number)
        :return:  Does not return an object, but creates genbank files.
        """

        # For every tier, every gene in that tier, and every organism,
        # resolve the accession number and write a GenBank file for it.
        for tier in self.tier_frame_dict:
            for gene in self.tier_frame_dict[tier].T:
                for organism in org_list:
                    raw_accession = str(gene_dict[gene][organism])
                    # The GenBank databases are keyed on the unversioned,
                    # upper-case accession, so strip the '.version' suffix.
                    accession = raw_accession.partition('.')[0].upper()
                    # Search the databases and create a GenBank file.
                    self.get_gbk_file(accession,
                                      gene,
                                      organism,
                                      server_flag=False)

    def get_gbk_file(self, accession, gene, organism, server_flag=None):
        """Search a GenBank database for a target accession number.

        This function searches through the given NCBI databases (created by
        uploading NCBI refseq .gbff files to a BioPython BioSQL database) and
        creates single GenBank files.  This function can be used after a
        blast or on its own.  If used on its own then the NCBI .db files must
        be manually moved to the proper directories.

        :param accession: Accession number of interest without the version.
        :param gene: Target gene of the accession number parameter.
        :param organism: Target organism of the accession number parameter.
        :param server_flag:  Becomes True once the record has been written,
                             which stops the database search.
                             (Default value = None)
        :return:  None.  Writes <gene>_<organism>.gbk into
                  raw_data/<gene>/GENBANK as a side effect.
        :raises FileNotFoundError:  If no database yields the accession.
        """

        gene_path = self.raw_data / Path(gene) / Path('GENBANK')
        Path.mkdir(gene_path, parents=True, exist_ok=True)

        # Parse each database to find the proper GenBank record
        for FILE in self.db_files_list:
            db_file_path = self.ncbi_db_repo / Path(FILE)
            # Stop searching if the GenBank record has been created.
            if server_flag is True:
                break
            # NOTE(review): the server handle is never closed -- confirm
            # whether BioSeqDatabase requires explicit cleanup here.
            server = BioSeqDatabase.open_database(driver='sqlite3',
                                                  db=str(db_file_path))
            # Parse the sub-databases
            for SUB_DB_NAME in server.keys():
                db = server[SUB_DB_NAME]
                try:
                    record = db.lookup(accession=accession)
                    gbk_file = '%s_%s.gbk' % (gene, organism)
                    gbk_file_path = gene_path / Path(gbk_file)
                    with open(gbk_file_path, 'w') as GB_file:
                        GB_file.write(record.format('genbank'))
                        self.genbanklog.info(GB_file.name, 'created')
                    # Make sure we have the correct GenBank file.
                    self.gbk_quality_control(gbk_file_path, gene, organism)
                    # Stop searching if the GenBank record has been created.
                    server_flag = True
                    break
                except IndexError:
                    # db.lookup raises IndexError when the accession is not
                    # in this sub-database; try the next one.
                    self.genbanklog.critical(
                        'Index Error in %s.  Moving to the next database...' %
                        SUB_DB_NAME)
                    continue

        # If the file has not been created after searching, then raise an error
        if server_flag is not True:
            self.genbanklog.critical(
                "The GenBank file was not created for %s (%s, %s)." %
                (accession, gene, organism))
            raise FileNotFoundError

    def gbk_quality_control(self, gbk_file, gene, organism):
        """Ensure the quality or validity of the retrieved GenBank record.

        Reads the GenBank record and checks that the gene and the organism
        annotated in the record match the gene and organism from the
        accession file.  A mismatch means the BLAST search returned the
        wrong accession number.

        :param gbk_file:  The path to a GenBank file.
        :param gene:  A gene name from the accession file.
        :param organism:  An organism name from the accession file.
        :raises BrokenPipeError:  If the gene and/or the organism in the
                                  GenBank record do not match the
                                  accession file.
        :return:  Does not return an object; records the validated
                  accession in ``self.duplicated_dict["validated"]``.
        """

        # TODO-ROB:  Check the bad data here against the missing/duplicate files
        record = SeqIO.read(gbk_file, 'genbank')
        gene_flag = False
        organism_flag = False
        accession = record.id
        self.gbk_gene_synonym = {}
        self.duplicated_dict["validated"] = {}

        # The "organism" qualifier is a list that should hold exactly one entry.
        gbk_organism = record.features[0].qualifiers["organism"]
        if len(gbk_organism) == 1:
            # Normalize to the accession-file convention (underscores, no spaces).
            gbk_organism = gbk_organism[0].replace(" ", "_")
        else:
            self.genbanklog.critical(
                "Two organisms exist in the GenBank file.  Is this normal?")
            raise BrokenPipeError

        # Check to make sure the organism in the GenBank file matches the
        # organism from the accession file.
        if gbk_organism == organism:
            self.genbanklog.info(
                "The GenBank organism, %s, has been verified for %s." %
                (organism, gene))
        else:
            organism_flag = True

        # Collect the gene name(s) from the GenBank record, including any
        # synonyms, so an accession-file alias still validates.
        gbk_genes = record.features[1].qualifiers["gene"]
        if "gene_synonym" in str(record.features[1].qualifiers.keys()):
            # Key the synonym map on the primary gene name (a string).
            # NOTE: the previous code keyed on the list itself, which raises
            # "TypeError: unhashable type: 'list'".
            base_gene_name = gbk_genes[0]
            gbk_genes.extend(record.features[1].qualifiers["gene_synonym"])
            # Create a dictionary entry mapping the primary name to all names.
            self.gbk_gene_synonym[base_gene_name] = []
            self.gbk_gene_synonym[base_gene_name].extend(gbk_genes)

        # Check to make sure the gene (or one of its synonyms) in the GenBank
        # file matches the gene from the accession file.
        for gbk_gene in gbk_genes:
            if gbk_gene == gene:
                gene_flag = False
                self.genbanklog.info(
                    "The GenBank gene, %s, has been verified for %s." %
                    (gene, organism))
                break
            else:
                gene_flag = True

        # TODO-ROB:  Add a verified key to the duplicates dictionary.
        # Log every mismatch that occurred, then raise if anything failed.
        if organism_flag is True:
            self.genbanklog.critical(
                "The organisms don't match.\n\tGenBank: %s \n\tAccession File: %s"
                % (gbk_organism, organism))
        if gene_flag is True:
            self.genbanklog.critical(
                "The genes don't match. \n\tGenBank: %s \n\tAccession File: %s"
                % (gbk_genes, gene))
        if organism_flag is True or gene_flag is True:
            raise BrokenPipeError

        self.duplicated_dict["validated"][accession] = [gene, organism]

    def gbk_upload(self):
        """Upload a BioSQL database with target GenBank data (.gbk files).

        This method is only usable after creating GenBank records with this
        class.  It uploads BioSQL databases with target GenBank data (.gbk
        files), creating a compact set of data for each project.  One
        database file is created per tier; each gene gets its own
        sub-database named after the gene.

        :return:  Does not return an object.
        """

        t_count = 0
        # Parse the tier dictionary; one BioSQL database file per tier.
        for TIER in self.tier_frame_dict.keys():
            db_name = str(TIER) + '.db'
            db_file_path = self.target_gbk_db_path / Path(db_name)
            # An existing database file is stale or bad, so start from a
            # fresh copy of the template either way.
            # TODO-ROB: This part is broken until the template db creation and management is added
            if os.path.isfile(str(db_file_path)):
                os.remove(str(db_file_path))
            self.genbanklog.warning(
                'Copying Template BioSQL Database...  This may take a few minutes...'
            )
            shutil.copy2('Template_BioSQL_DB.db', str(db_file_path))

            server = BioSeqDatabase.open_database(driver='sqlite3',
                                                  db=str(db_file_path))
            gene_path = self.raw_data
            # Parse the raw_data folder to get the name of each gene.
            for GENE in os.listdir(str(gene_path)):
                sub_db_name = GENE
                genbank_path = gene_path / Path(GENE) / Path('GENBANK')
                # Parse the GenBank file names for each gene in order to
                # upload them to a custom BioSQL database.
                for FILE in os.listdir(str(genbank_path)):
                    # Try to load the database.
                    try:
                        if sub_db_name not in server.keys():
                            server.new_database(sub_db_name)
                        db = server[sub_db_name]
                        # Parse from the full path; FILE alone is only a
                        # basename and does not resolve from the cwd.
                        count = db.load(SeqIO.parse(
                            str(genbank_path / Path(FILE)), 'genbank'))
                        server.commit()
                        self.genbanklog.info('Server Commited %s' %
                                             sub_db_name)
                        self.genbanklog.info('%s database loaded with %s.' %
                                             (db.dbid, FILE))
                        self.genbanklog.info(
                            "That file contains %s genbank records." %
                            str(count))
                        t_count = t_count + count
                        self.genbanklog.info(
                            'The total number of files loaded so far is %i.' %
                            t_count)
                    # If the database cannot be loaded, roll back the server
                    # and re-raise the original error.
                    except BaseException:
                        server.rollback()
                        # Try to delete the partially-loaded sub-database
                        # and commit the deletion.
                        try:
                            del server[sub_db_name]
                            server.commit()
                        # If it cannot be deleted then raise that error.
                        except BaseException:
                            raise
                        raise

    def get_fasta_files(self, acc_dict, db=True):
        """Create FASTA files for each GenBank record in the accession dictionary.

        It can search through a BioSQL database or it can crawl a directory
        for .gbk files.

        :param acc_dict:  An accession dictionary like the one created by
                          CompGenObjects.
        :param db:  A flag that determines whether or not to use the custom
                    BioSQL database or to use .gbk files.
                    (Default value = True)
        :return:  Returns FASTA files for each GenBank record.
        """

        # Get FASTA files from the BioSQL GenBank databases.
        if db is True:
            # Parse the directory that contains the databases for the project of interest.
            for database in os.listdir(str(self.target_gbk_db_path)):
                # os.listdir yields basenames; open from the full path.
                db_path = self.target_gbk_db_path / Path(database)
                server = BioSeqDatabase.open_database(driver="sqlite3",
                                                      db=str(db_path))
                try:
                    # sub_db (not "db") avoids shadowing the db flag parameter.
                    for db_name in server.keys():
                        sub_db = server[db_name]
                        # For each GenBank record in the database write a set of FASTA files.
                        for item in sub_db.keys():
                            record = sub_db.lookup(item)
                            self.write_fasta_files(record, acc_dict)
                            self.genbanklog.info(
                                "FASTA files for %s created from BioSQL database."
                                % item)
                except BaseException:
                    # Re-raise the original exception.  (The previous
                    # ``raise ()`` was invalid and raised a TypeError.)
                    raise
        # Get FASTA files from the GenBank files.
        # TODO-ROB change this.  Broken by new directory structure
        # TODO-ROB directory looks like /raw_data/Gene_1/GENBANK/*.gbk
        elif db is False:
            # Parse the directory that contains the GenBank records for the project of interest.
            for root, _, gbk_files in os.walk(str(self.target_gbk_files_path)):
                # For each GenBank record write a set of FASTA files.
                for gbk_file in gbk_files:
                    # Use == (value comparison); ``is`` compares string
                    # identity and was effectively always False here.
                    if Path(gbk_file).suffix == '.gbk':
                        # os.walk yields basenames; read from the full path.
                        record = SeqIO.read(os.path.join(root, gbk_file),
                                            'genbank')
                        self.write_fasta_files(record, acc_dict)
                        self.genbanklog.info("FASTA files for %s created." %
                                             gbk_file)

    def write_fasta_files(self, record, acc_dict):
        """Create a dictionary for formatting the FASTA header & sequence.

        For each feature in the GenBank record, builds a formatter
        dictionary and dispatches it to ``solo_fasta`` and/or
        ``multi_fasta`` depending on the class flags.

        :param record:  A GenBank record created by BioPython.
        :param acc_dict:  Accession dictionary from the CompGenObjects class.
        :return:  Does not return an object; writes FASTA files.
        """

        # Record-level values are loop-invariant: compute them once instead
        # of once per feature.
        accession = record.id
        gene = acc_dict[accession][0]
        organism = acc_dict[accession][1]
        # Variable for minimalistic FASTA files, e.g. "Homo_sapiens" -> "H_sapiens".
        genus, sep, species = organism.partition('_')
        min_org = str(''.join([genus[0], sep, species[0:28]]))

        feat_type_list = []
        for feature in record.features:
            # Keep a list of feature types to identify duplicates (for naming the FASTA files).
            # The first iteration of the feature type contains no number.
            # The following iterations are concatenated with a number.
            feat_type = str(feature.type)
            feat_type_list.append(feat_type)
            duplicate_num = feat_type_list.count(feat_type)
            if duplicate_num == 1:
                feat_type_rank = feat_type
            else:
                feat_type_rank = feat_type + str(duplicate_num)

            # TODO-ROB:  Remove the GI number stuff here or at least prepare for
            # file with no GI.
            # Create a dictionary and format FASTA file entries.
            # NOTE(review): this assumes every feature carries 'protein_id',
            # 'product', 'translation', and 'note' qualifiers — features
            # without them will raise KeyError.  Confirm against the inputs.
            fmt = {
                'na_gi': str(record.annotations['gi']),
                'aa_gi': str(self.protein_gi_fetch(feature)),
                'na_acc_n': str(accession),
                'aa_acc_n': str(feature.qualifiers['protein_id'][0]),
                'na_description': str(record.description),
                'aa_description': str(feature.qualifiers['product'][0]),
                'na_seq': str(feature.extract(record.seq)),
                'aa_seq': str(feature.qualifiers['translation'][0]),
                'na_misc_feat': str(feature.qualifiers['note'][0]),
                'org': str(organism),
                'gene': str(gene),
                'min_org': str(min_org),
                'feat_type': str(feat_type),
                'feat_type_rank': str(feat_type_rank),
                'path': str(self.raw_data / Path(gene) / Path('GENBANK'))
            }
            # Set up minimalistic FASTA headers and sequence entries for
            # Nucleic Acid and Amino Acid sequences.
            na_entry = ">{min_org}\n{na_seq}\n".format(**fmt)
            aa_entry = ">{min_org}\n{aa_seq}\n".format(**fmt)
            # For full FASTA headers/sequences set min_fasta to False.
            if self.min_fasta is False:
                na_entry = ">gi|{na_gi}|ref|{na_acc_n}| {na_description}\n{na_seq}\n".format(
                    **fmt)
                aa_entry = ">gi|{aa_gi}|reg|{aa_acc_n}| {aa_description} {org}\n{aa_seq}\n".format(
                    **fmt)

            # Write the desired FASTA files per the class flags.
            if self.solo is True:
                self.solo_fasta(na_entry, aa_entry, fmt)
            if self.multi is True:
                self.multi_fasta(na_entry, aa_entry, fmt)

    def solo_fasta(self, na_entry, aa_entry, fmt):
        """Write a single feature's sequence(s) to uniquely named FASTA files.

        Uses the formatter dictionary to pick the file name and extension,
        then writes exactly one entry per file (write mode truncates).

        :param na_entry:  A string representing the Nucleic Acid sequence data in FASTA format.
        :param aa_entry:  A string representing the Amino Acid sequence data in FASTA format.
        :param fmt:  A dictionary for formatting the FASTA entries and the file names.
        :return:  Does not return an object, but creates single entry FASTA files.
        """

        write_mode = 'w'

        # Pull the values needed for naming out of the formatter dictionary.
        feature_kind = fmt['feat_type']
        rank = fmt['feat_type_rank']
        target_path = fmt['path']
        gene_name = fmt['gene']
        organism_name = fmt['org']

        def _dump(entry, label, extension):
            # Open a uniquely named file via the naming helper, write one
            # entry, and close it.
            handle = self.name_fasta_file(target_path, gene_name,
                                          organism_name, label, rank,
                                          extension, write_mode)
            handle.write(entry)
            handle.close()

        if feature_kind == "CDS":
            # .ffn: FASTA for Coding Nucleic Acids; .faa: FASTA for Amino Acids.
            _dump(na_entry, feature_kind, '.ffn')
            _dump(aa_entry, 'Protein', '.faa')
        elif feature_kind == "misc_feature":
            # Miscellaneous features get a custom header carrying the note text.
            custom_na_entry = ">gi|{na_gi}|ref|{na_acc_n}| {na_description} Feature: {na_misc_feat}\n{na_seq}\n".format(
                **fmt)
            # .fna: generic FASTA file for Nucleic Acids.
            _dump(custom_na_entry, feature_kind, '.fna')
        elif feature_kind != "variation":
            # Everything else (except variations) goes into a generic .fasta file.
            _dump(na_entry, 'Other', '.fasta')

    def multi_fasta(self, na_entry, aa_entry, fmt):
        """Append an orthologous sequence of a feature to a uniquely named file.

        Uses a dictionary for formatting.  Unlike ``solo_fasta``, files are
        opened in append mode, so repeated calls accumulate entries into a
        multi-entry FASTA file.

        :param na_entry:  A string representing the Nucleic Acid sequence data in FASTA format.
        :param aa_entry:  A string representing the Amino Acid sequence data in FASTA format.
        :param fmt:  A dictionary for formatting the FASTA entries and the file names.
        :return:  Does not return an object, but creates or appends to a multi entry FASTA file.
        """

        # Append mode: accumulate entries across calls instead of truncating.
        mode = 'a'

        # Create the desired variables from the formatter dictionary.
        feat_type = fmt['feat_type']
        feat_type_rank = fmt['feat_type_rank']
        path = fmt['path']
        gene = fmt['gene']
        org = fmt['org']

        if feat_type == "CDS":
            # Create a MASTER .ffn file (multi-FASTA file for Coding Nucleic Acids)
            extension = '.ffn'
            file = self.name_fasta_file(path, gene, org, feat_type,
                                        feat_type_rank, extension, mode)
            file.write(na_entry)
            file.close()
            # Create a MASTER .faa file (multi-FASTA file for Amino Acids)
            extension = '.faa'
            file = self.name_fasta_file(path, gene, org, feat_type,
                                        feat_type_rank, extension, mode)
            file.write(aa_entry)
            file.close()
        elif feat_type == "misc_feature":
            # Rebuild the NA entry with a custom header that carries the
            # miscellaneous-feature note text.
            na_entry = ">gi|{na_gi}|ref|{na_acc_n}| {na_description} Feature: {na_misc_feat}\n{na_seq}\n".format(
                **fmt)
            # Creates .fna files (generic FASTA file for Nucleic Acids)
            extension = '.fna'
            file = self.name_fasta_file(path, gene, org, feat_type,
                                        feat_type_rank, extension, mode)
            file.write(na_entry)
            file.close()