def __init__(self, source, dest=None):
    """
    Store the source FASTQ and the destination directory.

    :param source: path to the input FASTQ file.
    :param dest:   optional destination directory; when omitted, defaults
                   to the source file's prefix path with a '.fastqc' suffix.
    """
    # Source file #
    self.source = FASTQ(source)
    # Destination directory: branch before wrapping, so we never call
    # DirectoryPath(None) (the original built it unconditionally and
    # then overwrote it in the default case) #
    if dest is None:
        self.dest = DirectoryPath(self.source.prefix_path + '.fastqc')
    else:
        self.dest = DirectoryPath(dest)
def __init__(self, path, empty=False):
    """
    Wrap *path* as a git working tree.

    :param path:  directory expected to contain a '.git' subdirectory.
    :param empty: when True, allow the repository to not exist yet.
    """
    # Initialize the underlying directory path #
    DirectoryPath.__init__(self, path)
    # Where the git metadata lives #
    self.git_dir = self.path + '.git'
    # Unless told otherwise, the repository must already exist #
    if not (empty or self):
        raise Exception("No git repository at '%s'" % self.git_dir)
    # Options passed to every git invocation #
    self.default = ["--git-dir=" + self.git_dir,
                    "--work-tree=" + self.path]
def __init__(self, command, language='python', base_dir=None, modules=None, **kwargs):
    """
    Prepare a SLURM job that runs *command*, with static copies of the
    given python *modules* placed next to the log directory so the job
    uses a frozen snapshot of the code.

    :param command:  the command to run (a string or a list of strings).
    :param language: language of the generated script.
    :param base_dir: directory under which a timestamped log directory is
                     created; defaults to the current working directory
                     at call time. (The original evaluated
                     `os.path.abspath(os.getcwd())` in the default
                     argument, freezing the value at import time.)
    :param modules:  python modules to copy statically for the job.
    """
    # Default base directory, resolved at call time #
    if base_dir is None:
        base_dir = os.path.abspath(os.getcwd())
    # Check the modules variable is a list #
    if modules is None:
        self.modules = []
    elif not isinstance(modules, list):
        self.modules = list(modules)
    else:
        self.modules = modules
    # Check command type #
    if not isinstance(command, list):
        command = [command]
    # Log directory: find a timestamped subdirectory that does not exist
    # yet. NOTE: the original rebound `base_dir` inside this loop, so on
    # every retry the new timestamp was appended to the *previous*
    # candidate path, compounding directories — use a separate variable #
    for i in range(30):
        now = datetime.datetime.now(dateutil.tz.tzlocal())
        log_name = now.strftime("%Y-%m-%da%Hh%Mm%Ss%Z%z")
        log_dir = DirectoryPath(base_dir + log_name + '/')
        if not log_dir.exists:
            log_dir.create()
            break
        # Candidate already exists: wait so the next timestamp differs #
        time.sleep(2)
    else:
        # All 30 candidates existed: reuse the last one #
        log_dir.create()
    base_dir = log_dir
    # Modules directory #
    modules_dir = DirectoryPath(base_dir + "modules/")
    modules_dir.create()
    # The script to be sent #
    script = []
    # Copy modules to the log directory #
    for module in self.modules:
        module_dir = os.path.dirname(module.__file__)
        module_name = module.__name__
        repos_dir = GitRepo(os.path.abspath(module_dir + '/../'))
        project_name = os.path.basename(repos_dir)
        static_module_dir = modules_dir + project_name + '/'
        module_version = module.__version__ + ' ' + repos_dir.tag
        # Copy #
        print("Making static copy of module '%s' for SLURM job..." % module_name)
        sh.cp('-R', repos_dir, static_module_dir)
        # Make script — note the generated script uses a python 2 style
        # `print` statement; presumably the remote interpreter is
        # python 2 — TODO confirm #
        script.insert(0, "sys.path.insert(0, '%s')" % static_module_dir)
        script += ["import %s" % module_name]
        script += ["print 'Using static copy of module %s version %s'" % (module_name, module_version)]
    # Prepend to the script to be sent #
    script.insert(0, "import os, sys")
    # Add the user's command to the script #
    script += command
    # Super #
    JobSLURM.__init__(self, script, language, base_dir, **kwargs)
def __init__(self, seq_type=None, base_dir=None):
    """
    Store the sequence type and build the base directory paths.

    :param seq_type: either 'prot' or 'nucl'.
    :param base_dir: parent directory for the databases; defaults to the
                     user's home directory.
    """
    # The sequence type is either 'prot' or 'nucl' #
    self.seq_type = seq_type
    # Fall back on the home directory when none is given #
    if base_dir is None:
        base_dir = os.environ.get('HOME', '/') + '/'
    # The directory for this particular database #
    full_path = base_dir + 'databases/' + self.short_name + '/'
    self.base_dir = DirectoryPath(full_path)
    # Automatic sub-paths #
    self.autopaths = AutoPaths(self.base_dir, self.all_paths)
def __init__(self, path, empty=False):
    """
    Initialize a git repository located at *path*.

    :param path:  the working-tree directory of the repository.
    :param empty: pass True to accept a directory with no repository yet.
    """
    # Behave as a directory path first #
    DirectoryPath.__init__(self, path)
    # The git metadata directory #
    self.git_dir = self.path + '.git'
    # Guard clause: refuse missing repositories unless `empty` allows it #
    if not empty:
        if not self:
            raise Exception("No git repository at '%s'" % self.git_dir)
    # Arguments prepended to every git command #
    self.default = ["--git-dir=" + self.git_dir, "--work-tree=" + self.path]
def symlink_single_aidb(self):
    """
    During development, and for testing purposes we have a single
    AIDB that all countries can share and that is found in another
    repository.
    """
    # The path to the SQLite3 file #
    source = DirectoryPath(aidb_repo + 'aidb.db')
    # Check it exists — an explicit check instead of `assert`, which
    # would be silently stripped when running under `python -O`;
    # the exception type is kept so existing callers still catch it #
    if not source:
        msg = "The sqlite3 database at '%s' does not seem to exist."
        raise AssertionError(msg % source)
    # Symlink #
    destin = self.paths.aidb
    source.link_to(destin)
def __init__(self, data_dir=None):
    """
    Set up all the paths for the Greengenes mothur database.

    :param data_dir: the directory that contains all databases;
                     defaults to `~/databases/`.
    """
    # Default location for all databases #
    if data_dir is None:
        data_dir = home + 'databases/'
    # Base directory for paths #
    self.base_dir = DirectoryPath(data_dir + self.short_name + '/')
    self.autopaths = AutoPaths(self.base_dir, self.all_paths)
    # Remote locations of the two archives #
    self.ref_url = self.base_url + "gg_13_8_99.refalign.tgz"
    self.tax_url = self.base_url + "gg_13_8_99.taxonomy.tgz"
    # Local destinations for the downloads #
    self.ref_dest = self.autopaths.alignment
    self.tax_dest = self.autopaths.taxonomy
    # The files obtained after extraction, as FilePath objects #
    self.alignment = FilePath(self.base_dir + "gg_13_8_99.refalign")
    self.taxonomy = FilePath(self.base_dir + "gg_13_8_99.gg.tax")
def __init__(self, continent, data_dir=None):
    """Store the data directory paths where everything will start from."""
    # Reference to the parent continent object #
    self.continent = continent
    # The main data directory #
    self.data_dir = DirectoryPath(data_dir)
    # Country codes #
    self.set_codes()
    # Reference years #
    self.set_years()
def set_paths(self, base_dir, script_path):
    """
    Set the directory, the script path and the outfile path.

    :param base_dir:    optional directory under which the run script and
                        the output file are created.
    :param script_path: optional explicit path for the run script;
                        overrides the base-directory derived path.
    """
    # Make absolute paths #
    if 'change_dir' in self.kwargs:
        self.kwargs['change_dir'] = DirectoryPath(os.path.abspath(self.kwargs['change_dir']))
    if 'out_file' in self.kwargs:
        self.kwargs['out_file'] = FilePath(os.path.abspath(self.kwargs['out_file']))
    # In case there is a base directory #
    if base_dir is not None:
        self.base_dir = DirectoryPath(os.path.abspath(base_dir))
        # Derive paths from the absolute `self.base_dir` — the original
        # concatenated the raw (possibly relative) `base_dir` here,
        # inconsistent with the absolute path computed just above #
        self.script_path = FilePath(self.base_dir + "run." + self.extensions[self.language])
        self.kwargs['change_dir'] = self.base_dir
        self.kwargs['out_file'] = FilePath(self.base_dir + "run.out")
    # Other cases #
    if base_dir is None and script_path is None:
        self.script_path = FilePath(new_temp_path())
    if script_path is not None:
        self.script_path = FilePath(os.path.abspath(script_path))
def __init__(self, fam_name):
    """Record the Pfam family name and set up its directory paths."""
    self.fam_name = fam_name
    # The directory holding this particular family #
    family_dir = pfam.autopaths.specific_dir + fam_name
    self.base_dir = DirectoryPath(family_dir)
    # Automatic sub-paths #
    self.p = AutoPaths(self.base_dir, self.all_paths)
Written by Lucas Sinclair and Paul Rougieux. JRC Biomass Project. Unit D1 Bioeconomy. """ # Built-in modules # import os # First party modules # from autopaths.dir_path import DirectoryPath from autopaths.auto_paths import AutoPaths from plumbing.cache import property_cached # Where is the data, default case # aidb_repo = DirectoryPath("~/repos/libcbm_aidb/") # But you can override that with an environment variable # if os.environ.get("LIBCBM_AIDB"): aidb_repo = DirectoryPath(os.environ['LIBCBM_AIDB']) ############################################################################### class AIDB(object): """ This class will provide access to the archive index database also called 'cbm_defaults' in libcbm. It is an SQLite3 database that weighs approx 18 MiB. To symlink the single test database to all countries do the following: >>> from libcbm_runner.core.continent import continent
class GreengenesMothur(Database):
    """
    This is the Greengenes database, in its specific version from mothur.
    Seen at:

    https://mothur.org/wiki/greengenes-formatted_databases/

    To install:

        >>> from seqsearch.databases.mothur.greengenes import gg_mothur
        >>> gg_mothur.download()
        >>> gg_mothur.unzip()

    It will place the results in `~/databases/gg_mothur/`.
    This database is from 2013.
    """

    nickname   = "gg"
    tag        = "greengenes"
    short_name = "gg_mothur"
    long_name  = "The Greengenes v13_8_99 database (mothur version)"
    version    = "13_8_99"
    base_url   = "https://mothur.s3.us-east-2.amazonaws.com/wiki/"

    all_paths = """
    /gg_alignment.tgz
    /gg_taxonomy.tgz
    """

    @property
    def rank_names(self):
        """
        The names of the taxonomic rank at each level.
        There are a total of 7 ranks.
        """
        return ['Domain',   # 0
                'Phylum',   # 1
                'Class',    # 2
                'Order',    # 3
                'Family',   # 4
                'Genus',    # 5
                'Species']  # 6

    def __init__(self, data_dir=None):
        """
        Build all local and remote paths.

        :param data_dir: the directory that contains all databases;
                         defaults to `~/databases/`.
        """
        # The directory that contains all databases #
        if data_dir is None:
            data_dir = home + 'databases/'
        # Base directory for paths #
        self.base_dir = DirectoryPath(data_dir + self.short_name + '/')
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)
        # Location of zip file remotely #
        self.ref_url = self.base_url + "gg_13_8_99.refalign.tgz"
        self.tax_url = self.base_url + "gg_13_8_99.taxonomy.tgz"
        # Location of zip file locally #
        self.ref_dest = self.autopaths.alignment
        self.tax_dest = self.autopaths.taxonomy
        # The results after download, as FilePath objects #
        self.alignment = FilePath(self.base_dir + "gg_13_8_99.refalign")
        self.taxonomy = FilePath(self.base_dir + "gg_13_8_99.gg.tax")

    def download(self):
        """Download both archives (reference alignment and taxonomy)."""
        # Make sure the directory exists #
        self.base_dir.create(safe=True)
        # Remove previous downloads #
        self.ref_dest.remove()
        self.tax_dest.remove()
        # Message #
        print("\n Downloading '%s'" % self.ref_url)
        # Download #
        import wget
        wget.download(self.ref_url, out=self.ref_dest.path)
        # Message #
        print("\n Downloading '%s'" % self.tax_url)
        # Download #
        wget.download(self.tax_url, out=self.tax_dest.path)

    def unzip(self):
        """
        Extract both downloaded archives into the base directory.
        Archives are opened in context managers so the file handles are
        closed even on extraction errors (the original leaked them).
        """
        # Message #
        print("\n Extracting archive '%s'" % self.ref_dest)
        # Uncompress #
        with tarfile.open(self.ref_dest, 'r:gz') as archive:
            archive.extractall(self.base_dir)
        # Message #
        print("\n Extracting archive '%s'" % self.tax_dest)
        # Uncompress #
        with tarfile.open(self.tax_dest, 'r:gz') as archive:
            archive.extractall(self.base_dir)

    def __bool__(self):
        """
        Return True if the silva database was already downloaded and the
        results are stored on the filesystem. Return False otherwise.
        """
        return self.taxonomy.exists and self.alignment.exists
__version__ = '0.2.2' # Built-in modules # import os, sys # First party modules # from autopaths import Path from autopaths.dir_path import DirectoryPath from plumbing.git import GitRepo # Constants # project_name = 'libcbm_runner' project_url = 'https://github.com/xapple/libcbm_runner' # Get paths to module # self = sys.modules[__name__] module_dir = Path(os.path.dirname(self.__file__)) # The repository directory # repos_dir = module_dir.directory # The module is maybe in a git repository # git_repo = GitRepo(repos_dir, empty=True) # Where is the data, default case # libcbm_data_dir = DirectoryPath("~/repos/libcbm_data/") # But you can override that with an environment variable # if os.environ.get("LIBCBM_DATA"): libcbm_data_dir = DirectoryPath(os.environ['LIBCBM_DATA'])
class Database:
    """
    General database object to inherit from.

    Subclasses are expected to define `short_name`, and, depending on the
    retrieval mechanism used, `ftp_url`/`ftp_dir` plus either `pattern`
    or `files`, as well as `db_name` and `taxonomy` for the BLAST and
    taxonomy features — TODO confirm against the subclasses.
    """

    # The standard sub-directory layout, parsed by AutoPaths #
    all_paths = """
    /raw/
    /unzipped/
    /blast_db/
    """

    def __init__(self, seq_type=None, base_dir=None):
        # The sequence type is either 'prot' or 'nucl' #
        self.seq_type = seq_type
        # The default base directory #
        if base_dir is None: base_dir = os.environ.get('HOME', '/') + '/'
        # Make base_dir object #
        self.base_dir = base_dir + 'databases/' + self.short_name + '/'
        self.base_dir = DirectoryPath(self.base_dir)
        # Make autopaths object #
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)

    def __bool__(self):
        """
        Return True if the database was already downloaded and the results
        are stored on the filesystem. Return False otherwise.
        """
        return not self.autopaths.unzipped_dir.empty

    @property_cached
    def ftp(self):
        """If the data is to be obtained by FTP, here is the ftputil object."""
        from ftputil import FTPHost
        # Anonymous login; the connection is cached for the instance #
        ftp = FTPHost(self.ftp_url, "anonymous")
        ftp.chdir(self.ftp_dir)
        return ftp

    @property_cached
    def files_to_retrieve(self):
        """The files we want to download with their destinations."""
        # Either a glob-style `pattern` selects files from the FTP listing... #
        if hasattr(self, "pattern"):
            files = self.ftp.listdir(self.ftp.curdir)
            files.sort(key=natural_sort)
            return OrderedDict((f, FilePath(self.autopaths.raw_dir + f))
                               for f in files if fnmatch.fnmatch(f, self.pattern))
        # ...or an explicit `files` attribute lists them directly #
        # NOTE(review): returns None when neither attribute exists #
        if hasattr(self, "files"):
            return OrderedDict((f, FilePath(self.autopaths.raw_dir + f))
                               for f in self.files)

    @property
    def files_remaining(self):
        """The files we haven't downloaded yet based on size checks."""
        # A size mismatch with the remote file marks it as remaining #
        return OrderedDict((source, dest)
                           for source, dest in self.files_to_retrieve.items()
                           if dest.count_bytes != self.ftp.path.getsize(source))

    def download(self):
        """Retrieve all files from the FTP site."""
        # Create the directory #
        self.base_dir.create_if_not_exists()
        # Loop over files #
        for source, dest in tqdm(self.files_remaining.items()):
            dest.remove()
            self.ftp.download(source, dest)
            # Mark the file read-only once downloaded #
            dest.permissions.only_readable()

    @property
    def raw_files(self):
        """The files we have downloaded."""
        # Lazily wraps every downloaded file as a FASTA object #
        return map(FASTA, self.autopaths.raw_dir.contents)

    def ungzip(self):
        """Ungzip them."""
        # Gzip #
        for f in tqdm(self.raw_files):
            destination = self.autopaths.unzipped_dir + f.prefix
            f.ungzip_to(destination)
            destination.permissions.only_readable()

    def untargz(self):
        """Untargzip them."""
        # Gzip #
        for f in tqdm(self.raw_files): f.untargz_to(self.autopaths.unzipped_dir)
        # Make every extracted file read-only #
        for f in self.autopaths.unzipped_dir: f.permissions.only_readable()

    @property
    def sequences(self):
        """All the sequences from all the raw files."""
        for fasta in self.raw_files:
            for seq in fasta:
                yield seq

    #------------------ Only for preformatted BLAST databases ----------------#
    @property_cached
    def blast_db(self):
        """A BLASTable version of the sequences."""
        # Import #
        from seqsearch.search.blast import BLASTdb
        # Create object — requires the subclass to define `db_name` #
        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name, self.seq_type)
        # Return #
        return db

    #--------------------- Only for taxonomic databases ----------------------#
    @property_cached
    def tax_depth_freq(self):
        # Count how many taxonomy files have each classification depth;
        # assumes `self.taxonomy` is a tab-separated "name<TAB>lineage"
        # file with semicolon-delimited ranks — TODO confirm #
        def depths():
            with open(self.taxonomy, 'r') as handle:
                for line in handle:
                    line = line.strip('\n')
                    otu_name, species = line.split('\t')
                    yield len(species.split(';'))
        return Counter(depths())
def new_temp_dir(**kwargs):
    """
    Create a fresh temporary directory and return it as a DirectoryPath
    (with a trailing slash). Keyword arguments are forwarded to
    `tempfile.mkdtemp`.
    """
    path = tempfile.mkdtemp(**kwargs)
    return DirectoryPath(path + '/')