Пример #1
0
 def __init__(self, source, dest=None):
     """
     Record the input file and the directory that will receive the results.

     :param source: Path to the input file, it is wrapped in a `FASTQ` object.
     :param dest:   Path to the output directory. When omitted, it defaults to
                    the `prefix_path` of the source with '.fastqc' appended.
     """
     # Source #
     self.source = FASTQ(source)
     # Default case: resolve it before building the path object, so that
     # `DirectoryPath` is never handed a `None` #
     if dest is None:
         dest = self.source.prefix_path + '.fastqc'
     # Destination #
     self.dest = DirectoryPath(dest)
Пример #2
0
 def __init__(self, path, empty=False):
     """
     Set up the repository object on top of a directory path.

     :param path:  The directory of the repository, forwarded to
                   `DirectoryPath.__init__`.
     :param empty: When left falsy, an exception is raised if this object
                   evaluates as false.
     """
     # Initialise the parent class #
     DirectoryPath.__init__(self, path)
     # Location of the '.git' directory #
     self.git_dir = self.path + '.git'
     # Refuse to continue, unless we were told that being empty is fine #
     if not (empty or self):
         message = "No git repository at '%s'" % (self.git_dir)
         raise Exception(message)
     # Default arguments: where the git directory and the work tree are #
     git_dir_option   = "--git-dir=" + self.git_dir
     work_tree_option = "--work-tree=" + self.path
     self.default = [git_dir_option, work_tree_option]
Пример #3
0
 def __init__(self,
              command,
              language='python',
              base_dir=os.path.abspath(os.getcwd()),
              modules=None,
              **kwargs):
     """
     Build a job whose script starts by importing static copies of the
     given modules, and create a time-stamped log directory for it.

     :param command:  One script line, or a list of script lines, appended
                      after the generated import statements.
     :param language: Forwarded as is to `JobSLURM.__init__`.
     :param base_dir: The name of the log directory is appended directly to
                      this string (no separator is inserted), so it is
                      expected to end with a path separator.
                      NOTE: the default is evaluated only once, when the
                      function is defined, and `os.path.abspath` returns
                      a path without a trailing separator.
     :param modules:  An iterable of imported modules. Each one must have
                      the `__file__`, `__name__` and `__version__`
                      attributes. The parent directory of each module is
                      copied inside the log directory.
     :param kwargs:   Forwarded as is to `JobSLURM.__init__`.
     """
     # Check the modules variable is a list #
     if modules is None: self.modules = []
     elif not isinstance(modules, list): self.modules = list(modules)
     else: self.modules = modules
     # Check command type #
     if not isinstance(command, list): command = [command]
     # Log directory: remember the parent so that every attempt is built
     # from it, and not from the candidate of the previous attempt #
     parent_dir = base_dir
     for _ in range(30):
         now = datetime.datetime.now(dateutil.tz.tzlocal())
         log_name = now.strftime("%Y-%m-%da%Hh%Mm%Ss%Z%z")
         base_dir = DirectoryPath(parent_dir + log_name + '/')
         if not base_dir.exists:
             base_dir.create()
             break
         # The name is taken, wait for the time stamp to change #
         time.sleep(2)
     else:
         # Every attempt collided, fall back on the last candidate #
         base_dir.create()
     # Modules directory #
     modules_dir = DirectoryPath(base_dir + "modules/")
     modules_dir.create()
     # The script to be sent #
     script = []
     # Copy modules to the log directory #
     for module in self.modules:
         module_dir = os.path.dirname(module.__file__)
         module_name = module.__name__
         repos_dir = GitRepo(os.path.abspath(module_dir + '/../'))
         project_name = os.path.basename(repos_dir)
         static_module_dir = modules_dir + project_name + '/'
         module_version = module.__version__ + ' ' + repos_dir.tag
         # Copy #
         print("Making static copy of module '%s' for SLURM job..." %
               module_name)
         sh.cp('-R', repos_dir, static_module_dir)
         # Make script: the path entries go first, the imports go last #
         script.insert(0, "sys.path.insert(0, '%s')" % static_module_dir)
         script += ["import %s" % module_name]
         # The parenthesized form is valid in both python 2 and python 3 #
         script += [
             "print('Using static copy of module %s version %s')" %
             (module_name, module_version)
         ]
     # Prepend to the script to be sent #
     script.insert(0, "import os, sys")
     # Add the user's command to the script #
     script += command
     # Super #
     JobSLURM.__init__(self, script, language, base_dir, **kwargs)
Пример #4
0
 def __init__(self, seq_type=None, base_dir=None):
     """
     Record the sequence type and compute where the database is stored.

     :param seq_type: Either 'prot' or 'nucl'.
     :param base_dir: The directory under which 'databases/<short_name>/'
                      is placed. Defaults to the `HOME` environment
                      variable, or to '/' when that variable is not set.
                      A missing trailing separator is added.
     """
     # The sequence type is either 'prot' or 'nucl' #
     self.seq_type = seq_type
     # The default base directory #
     if base_dir is None:
         base_dir = os.environ.get('HOME', '/') + '/'
     # A user supplied directory must end with a separator, otherwise
     # the concatenation below would glue two path components together #
     elif not base_dir.endswith('/'):
         base_dir = base_dir + '/'
     # Make base_dir object #
     self.base_dir = base_dir + 'databases/' + self.short_name + '/'
     self.base_dir = DirectoryPath(self.base_dir)
     # Make autopaths object #
     self.autopaths = AutoPaths(self.base_dir, self.all_paths)
Пример #5
0
 def __init__(self, path, empty=False):
     """
     Initialise the directory path, then remember where the '.git'
     directory is located.

     Unless `empty` is truthy, an exception is raised when this object
     evaluates as false.
     """
     # Parent class #
     DirectoryPath.__init__(self, path)
     # Where the git directory is #
     self.git_dir = self.path + '.git'
     # Only check when the caller did not allow an empty repository #
     if not empty:
         if not self:
             raise Exception("No git repository at '%s'" % self.git_dir)
     # Default arguments, the git directory first and the work tree second #
     options = []
     options.append("--git-dir=" + self.git_dir)
     options.append("--work-tree=" + self.path)
     self.default = options
Пример #6
0
 def symlink_single_aidb(self):
     """
     During development, and for testing purposes we have a single AIDB
     that all countries can share and that is found in another repository.

     The file 'aidb.db' inside `aidb_repo` is linked to `self.paths.aidb`.
     An `AssertionError` is raised when the source path object evaluates
     as false.
     """
     # The path to the SQLite3 file #
     source = DirectoryPath(aidb_repo + 'aidb.db')
     # Check it exists with an explicit test instead of an `assert`
     # statement, because the latter is skipped when python runs with `-O` #
     if not source:
         msg = "The sqlite3 database at '%s' does not seems to exist."
         raise AssertionError(msg % source)
     # Symlink #
     destin = self.paths.aidb
     source.link_to(destin)
Пример #7
0
 def __init__(self, data_dir=None):
     """
     Compute every remote and local path used by this database.

     :param data_dir: The directory that contains all databases. Defaults
                      to `home + 'databases/'`. A missing trailing
                      separator is added.
     """
     # The directory that contains all databases #
     if data_dir is None: data_dir = home + 'databases/'
     # A user supplied directory must end with a separator, otherwise
     # it would be glued to the short name below #
     elif not data_dir.endswith('/'): data_dir = data_dir + '/'
     # Base directory for paths #
     self.base_dir = DirectoryPath(data_dir + self.short_name + '/')
     self.autopaths = AutoPaths(self.base_dir, self.all_paths)
     # Location of zip file remotely #
     self.ref_url = self.base_url + "gg_13_8_99.refalign.tgz"
     self.tax_url = self.base_url + "gg_13_8_99.taxonomy.tgz"
     # Location of zip file locally #
     self.ref_dest = self.autopaths.alignment
     self.tax_dest = self.autopaths.taxonomy
     # The results after download, as FilePath objects #
     self.alignment = FilePath(self.base_dir + "gg_13_8_99.refalign")
     self.taxonomy = FilePath(self.base_dir + "gg_13_8_99.gg.tax")
Пример #8
0
 def __init__(self, continent, data_dir=None):
     """
     Keep a reference to the parent continent and to the main data
     directory, then set the country codes and the reference years.
     """
     # The directory where everything will start from #
     self.data_dir = DirectoryPath(data_dir)
     # The parent object #
     self.continent = continent
     # Country codes come first #
     self.set_codes()
     # Reference years come second #
     self.set_years()
Пример #9
0
 def set_paths(self, base_dir, script_path):
     """
     Set the directory, the script path and the outfile path.

     :param base_dir:    When given, the script and the output file are
                         both placed at `base_dir + "run.<something>"`.
     :param script_path: When given, it takes precedence for the location
                         of the script. When both parameters are `None`,
                         a new temporary path is used for the script.
     """
     # Turn the user supplied options into absolute path objects #
     wrappers = (('change_dir', DirectoryPath), ('out_file', FilePath))
     for option, path_type in wrappers:
         if option in self.kwargs:
             absolute = os.path.abspath(self.kwargs[option])
             self.kwargs[option] = path_type(absolute)
     # A base directory determines all the other locations #
     if base_dir is not None:
         self.base_dir = DirectoryPath(os.path.abspath(base_dir))
         self.script_path = FilePath(
             base_dir + "run." + self.extensions[self.language])
         self.kwargs['change_dir'] = base_dir
         self.kwargs['out_file'] = FilePath(base_dir + "run.out")
     # An explicit script path always wins, a temporary one is the last
     # resort when nothing at all was specified #
     if script_path is not None:
         self.script_path = FilePath(os.path.abspath(script_path))
     elif base_dir is None:
         self.script_path = FilePath(new_temp_path())
Пример #10
0
 def __init__(self, fam_name):
     """
     Record the name of the family and derive its directory from it.

     The directory is `pfam.autopaths.specific_dir` followed by the name.
     """
     # Name of the family #
     self.fam_name = fam_name
     # Where this family is stored #
     family_path = pfam.autopaths.specific_dir + self.fam_name
     self.base_dir = DirectoryPath(family_path)
     # Automatic paths inside that directory #
     self.p = AutoPaths(self.base_dir, self.all_paths)
Пример #11
0
Written by Lucas Sinclair and Paul Rougieux.

JRC Biomass Project.
Unit D1 Bioeconomy.
"""

# Built-in modules #
import os

# First party modules #
from autopaths.dir_path   import DirectoryPath
from autopaths.auto_paths import AutoPaths
from plumbing.cache       import property_cached

# Where is the data: the `LIBCBM_AIDB` environment variable wins when it is #
# set and not empty, otherwise the default location is used #
aidb_repo = DirectoryPath(os.environ.get("LIBCBM_AIDB") or "~/repos/libcbm_aidb/")

###############################################################################
class AIDB(object):
    """
    This class will provide access to the archive index database
    also called 'cbm_defaults' in libcbm.
    It is an SQLite3 database that weighs approx 18 MiB.

    To symlink the single test database to all countries do the following:

        >>> from libcbm_runner.core.continent import continent
Пример #12
0
class GreengenesMothur(Database):
    """
    This is the Greengenes database, in its specific version from mothur.
    Seen at:

    https://mothur.org/wiki/greengenes-formatted_databases/

    To install:

        >>> from seqsearch.databases.mothur.greengenes import gg_mothur
        >>> gg_mothur.download()
        >>> gg_mothur.unzip()

    It will place the results in `~/databases/gg_mothur/`.

    This database is from 2013.
    """

    nickname = "gg"
    tag = "greengenes"
    short_name = "gg_mothur"
    long_name = "The Greengenes v13_8_99 database (mothur version)"
    version = "13_8_99"
    base_url = "https://mothur.s3.us-east-2.amazonaws.com/wiki/"

    all_paths = """
                /gg_alignment.tgz
                /gg_taxonomy.tgz
                """

    @property
    def rank_names(self):
        """
        The names of the taxonomic rank at each level.
        There are a total of 7 ranks.
        """
        return [
            'Domain',   # 0
            'Phylum',   # 1
            'Class',    # 2
            'Order',    # 3
            'Family',   # 4
            'Genus',    # 5
            'Species',  # 6
        ]

    def __init__(self, data_dir=None):
        """
        Compute every remote and local path used by this database.

        :param data_dir: The directory that contains all databases. Defaults
                         to `home + 'databases/'`. A missing trailing
                         separator is added.
        """
        # The directory that contains all databases #
        if data_dir is None: data_dir = home + 'databases/'
        # A user supplied directory must end with a separator, otherwise
        # it would be glued to the short name below #
        elif not data_dir.endswith('/'): data_dir = data_dir + '/'
        # Base directory for paths #
        self.base_dir = DirectoryPath(data_dir + self.short_name + '/')
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)
        # Location of zip file remotely #
        self.ref_url = self.base_url + "gg_13_8_99.refalign.tgz"
        self.tax_url = self.base_url + "gg_13_8_99.taxonomy.tgz"
        # Location of zip file locally #
        self.ref_dest = self.autopaths.alignment
        self.tax_dest = self.autopaths.taxonomy
        # The results after download, as FilePath objects #
        self.alignment = FilePath(self.base_dir + "gg_13_8_99.refalign")
        self.taxonomy = FilePath(self.base_dir + "gg_13_8_99.gg.tax")

    def download(self):
        """
        Download the alignment archive and then the taxonomy archive to
        their local destinations, after removing any previous download.
        """
        # Make sure the directory exists #
        self.base_dir.create(safe=True)
        # Remove previous downloads #
        self.ref_dest.remove()
        self.tax_dest.remove()
        # Message #
        print("\n Downloading '%s'" % self.ref_url)
        # Download #
        import wget
        wget.download(self.ref_url, out=self.ref_dest.path)
        # Message #
        print("\n Downloading '%s'" % self.tax_url)
        # Download #
        wget.download(self.tax_url, out=self.tax_dest.path)

    def unzip(self):
        """
        Extract the alignment archive and then the taxonomy archive into
        the base directory.
        """
        for archive_path in (self.ref_dest, self.tax_dest):
            # Message #
            print("\n Extracting archive '%s'" % archive_path)
            # Uncompress, the context manager closes the file handle even
            # when the extraction fails #
            # NOTE(review): `extractall` trusts the member paths found
            # inside the archive, only use it on archives you trust #
            with tarfile.open(archive_path, 'r:gz') as archive:
                archive.extractall(self.base_dir)

    def __bool__(self):
        """
        Return True if the Greengenes database was already downloaded and
        the results are stored on the filesystem. Return False otherwise.
        """
        return self.taxonomy.exists and self.alignment.exists
Пример #13
0
__version__ = '0.2.2'

# Built-in modules #
import os
import sys

# First party modules #
from autopaths import Path
from autopaths.dir_path import DirectoryPath
from plumbing.git import GitRepo

# Constants #
project_name = 'libcbm_runner'
project_url  = 'https://github.com/xapple/libcbm_runner'

# This very module, and the directory that contains it #
self       = sys.modules[__name__]
module_dir = Path(os.path.dirname(self.__file__))

# The repository directory #
repos_dir = module_dir.directory

# The module is maybe in a git repository, so don't require one #
git_repo = GitRepo(repos_dir, empty=True)

# Where is the data: the `LIBCBM_DATA` environment variable wins when it is #
# set and not empty, otherwise the default location is used #
libcbm_data_dir = DirectoryPath(os.environ.get("LIBCBM_DATA") or "~/repos/libcbm_data/")
Пример #14
0
class Database:
    """
    General database object to inherit from.

    The methods below read several attributes that are not set here and
    must be provided by the subclasses: `short_name` in all cases, then
    `ftp_url`, `ftp_dir` and either `pattern` or `files` for the
    downloading, `db_name` for `blast_db` and `taxonomy` for
    `tax_depth_freq`.
    """

    all_paths = """
    /raw/
    /unzipped/
    /blast_db/
    """

    def __init__(self, seq_type=None, base_dir=None):
        """
        Record the sequence type and compute where the database is stored.

        :param seq_type: Either 'prot' or 'nucl'.
        :param base_dir: The directory under which 'databases/<short_name>/'
                         is placed. Defaults to the `HOME` environment
                         variable, or to '/' when that variable is not set.
                         A missing trailing separator is added.
        """
        # The sequence type is either 'prot' or 'nucl' #
        self.seq_type = seq_type
        # The default base directory #
        if base_dir is None:
            base_dir = os.environ.get('HOME', '/') + '/'
        # A user supplied directory must end with a separator, otherwise
        # the concatenation below would glue two path components together #
        elif not base_dir.endswith('/'):
            base_dir = base_dir + '/'
        # Make base_dir object #
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.base_dir = DirectoryPath(self.base_dir)
        # Make autopaths object #
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)

    def __bool__(self):
        """
        Return True if the database was already downloaded and the
        results are stored on the filesystem. Return False otherwise.
        """
        return not self.autopaths.unzipped_dir.empty

    @property_cached
    def ftp(self):
        """If the data is to be obtained by FTP, here is the ftputil object."""
        from ftputil import FTPHost
        ftp = FTPHost(self.ftp_url, "anonymous")
        ftp.chdir(self.ftp_dir)
        return ftp

    @property_cached
    def files_to_retrieve(self):
        """
        The files we want to download with their destinations.

        When the object has a `pattern` attribute, the remote directory is
        listed and only the matching names are kept. Otherwise, the names
        found in the `files` attribute are used. When neither attribute
        exists, the result is `None`.
        """
        if hasattr(self, "pattern"):
            files = self.ftp.listdir(self.ftp.curdir)
            files.sort(key=natural_sort)
            return OrderedDict((f, FilePath(self.autopaths.raw_dir + f))
                               for f in files
                               if fnmatch.fnmatch(f, self.pattern))
        if hasattr(self, "files"):
            return OrderedDict(
                (f, FilePath(self.autopaths.raw_dir + f)) for f in self.files)
        # Nothing was declared by the subclass #
        return None

    @property
    def files_remaining(self):
        """The files we haven't downloaded yet based on size checks."""
        return OrderedDict(
            (source, dest) for source, dest in self.files_to_retrieve.items()
            if dest.count_bytes != self.ftp.path.getsize(source))

    def download(self):
        """Retrieve all the remaining files from the FTP site."""
        # Create the directory #
        self.base_dir.create_if_not_exists()
        # Loop over files #
        for source, dest in tqdm(self.files_remaining.items()):
            dest.remove()
            self.ftp.download(source, dest)
            dest.permissions.only_readable()

    @property
    def raw_files(self):
        """The files we have downloaded, each wrapped in a `FASTA` object."""
        return map(FASTA, self.autopaths.raw_dir.contents)

    def ungzip(self):
        """Ungzip every raw file into the unzipped directory."""
        # Ungzip #
        for f in tqdm(self.raw_files):
            destination = self.autopaths.unzipped_dir + f.prefix
            f.ungzip_to(destination)
            destination.permissions.only_readable()

    def untargz(self):
        """Untargzip every raw file into the unzipped directory."""
        # Untargzip #
        for f in tqdm(self.raw_files):
            f.untargz_to(self.autopaths.unzipped_dir)
        # Protect the results #
        for f in self.autopaths.unzipped_dir:
            f.permissions.only_readable()

    @property
    def sequences(self):
        """All the sequences from all the raw files."""
        for fasta in self.raw_files:
            yield from fasta

    #------------------ Only for preformatted BLAST databases ----------------#
    @property_cached
    def blast_db(self):
        """A BLASTable version of the sequences."""
        # Import #
        from seqsearch.search.blast import BLASTdb
        # Create object #
        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name, self.seq_type)
        # Return #
        return db

    #--------------------- Only for taxonomic databases ----------------------#
    @property_cached
    def tax_depth_freq(self):
        """
        Count how many lines of the taxonomy file have each number of
        semicolon separated items in their second column.

        Every line must contain exactly two tab separated columns.
        """
        def depths():
            with open(self.taxonomy, 'r') as handle:
                for line in handle:
                    line = line.strip('\n')
                    otu_name, species = line.split('\t')
                    yield len(species.split(';'))

        return Counter(depths())
Пример #15
0
def new_temp_dir(**kwargs):
    """
    Create a new temporary directory and return it as a `DirectoryPath`.

    All the keyword arguments are forwarded to `tempfile.mkdtemp`.
    """
    created = tempfile.mkdtemp(**kwargs)
    return DirectoryPath(created + '/')