def __init__(self,
             query_path,                 # The sequences to search with
             db_path=pfam.hmm_db,        # The database to search against
             seq_type='prot' or 'nucl',  # The seq type of the query_path file
             e_value=0.001,              # The search threshold
             params=None,                # Extra params for the command line
             out_path=None,              # Where the results will be dropped
             executable=None,            # Path to a specific binary, if wanted
             cpus=None):                 # The number of threads to use
    """
    Record the settings for searching `query_path` against `db_path`.

    The default written as `'prot' or 'nucl'` evaluates to `'prot'`; the
    second string only hints at the other accepted value.

    The strings 'pfam' and 'tigrfam' are accepted as short names for
    `db_path`. When `out_path` is None the results land next to the query
    with the '.hmmout' extension. When it ends with a slash it is treated
    as a directory and the file is named after the query prefix.
    """
    # Plain attributes #
    self.query      = FASTA(query_path)
    self.db         = FilePath(db_path)
    self.params     = params or {}
    self.e_value    = e_value
    self.seq_type   = seq_type
    self.executable = FilePath(executable)
    # Use every core, capped at 32, unless told otherwise #
    self.cpus = min(multiprocessing.cpu_count(), 32) if cpus is None else cpus
    # Short names stand for the bundled databases #
    if db_path == 'pfam':
        self.db = pfam.hmm_db
    elif db_path == 'tigrfam':
        self.db = tigrfam.hmm_db
    # Work out where the output goes #
    if out_path is None:
        destination = self.query.prefix_path + '.hmmout'
    elif out_path.endswith('/'):
        destination = out_path + self.query.prefix + '.hmmout'
    else:
        destination = out_path
    self.out_path = FilePath(destination)
def files_to_retrieve(self):
    """Map the URL of each file we want to download to its destination."""
    # Listed in the order in which they should be retrieved #
    pairs = (
        (self.base_url + "protein.sequences.v9.1.fa.gz",
         FilePath(self.p.raw_proteins)),
        (self.base_url + "COG.mappings.v9.1.txt.gz",
         FilePath(self.p.raw_mappings)),
    )
    return OrderedDict(pairs)
def files_to_retrieve(self):
    """
    Map the name of each file we want to download to its destination.

    If the object has a `pattern` attribute, the current FTP directory is
    listed, naturally sorted, and filtered with that pattern. Otherwise,
    if it has a `files` attribute, those names are used as they are.
    When neither attribute exists, None is returned.
    """
    # Option one: match a pattern against the remote listing #
    if hasattr(self, "pattern"):
        listing = self.ftp.listdir(self.ftp.curdir)
        listing.sort(key=natural_sort)
        matched = OrderedDict()
        for name in listing:
            if fnmatch.fnmatch(name, self.pattern):
                matched[name] = FilePath(self.autopaths.raw_dir + name)
        return matched
    # Option two: an explicit collection of file names #
    if hasattr(self, "files"):
        chosen = OrderedDict()
        for name in self.files:
            chosen[name] = FilePath(self.autopaths.raw_dir + name)
        return chosen
    # Nothing was specified #
    return None
def __init__(self, data_dir=None):
    """
    Compute every location this database needs, remote and local.

    `data_dir` is the directory that contains all databases. It defaults
    to 'databases/' inside `home`.
    """
    # Fall back on the standard location #
    data_dir = home + 'databases/' if data_dir is None else data_dir
    # Everything lives under a directory named after the database #
    self.base_dir  = data_dir + self.short_name + '/'
    self.autopaths = AutoPaths(self.base_dir, self.all_paths)
    # The remote archive and where it gets saved locally #
    self.url  = self.base_url + self.base_name + ".tgz"
    self.dest = self.autopaths.tgz
    # The two result files share a common stem #
    folder = self.base_dir + self.base_name + '/'
    stem   = folder + self.base_name
    self.alignment = FilePath(stem + ".fasta")
    self.taxonomy  = FilePath(stem + ".tax")
def check_setup_py(path_of_setup):
    """
    Parses the required modules from a `setup.py` file and checks they
    are importable and have the minimum required version installed.

    Some ideas for extracting dependency information from a `setup.py` file:
    https://stackoverflow.com/questions/24236266/

    Instead let's try the `parsesetup` package.
    Note: The code in the setup.py will be evaluated.

    Other interesting projects:
    https://pypi.org/project/requirements-parser/

    Only the `==` and `>=` operators are recognized when looking for a
    version. Any other requirement string is passed along whole, with no
    version. Whitespace around the package name and the version is
    ignored, so that "numpy >= 1.0" is understood like "numpy>=1.0".

    Typically you can use this function like this:

        >>> from plumbing.dependencies import check_setup_py
        >>> check_setup_py('~/module_name/setup.py')
    """
    # First let's check we have that module #
    check_module('parsesetup')
    import parsesetup
    # Parse it #
    from autopaths.file_path import FilePath
    path_of_setup = FilePath(path_of_setup)
    # Run it #
    setup_args = parsesetup.parse_setup(path_of_setup, trusted=True)
    requires   = setup_args.get('install_requires', [])
    # Split every requirement into a name and an optional version #
    packages = {}
    for requirement in requires:
        # Strip the pieces, otherwise the spaces that are allowed around
        # the operator would end up in the package name and the version.
        parts = [part.strip() for part in re.split(r'==|>=', requirement)]
        packages[parts[0]] = parts[1] if len(parts) == 2 else None
    # Loop #
    for package, version in packages.items():
        check_module(package, version)
def build_tree_raxml(self,
                     new_path    = None,
                     seq_type    = 'nucl' or 'prot',
                     num_threads = None,
                     free_cores  = 2,
                     keep_dir    = False):
    """
    Make a tree with RAxML.

    Parameters:
        new_path:    Where to put the result. Defaults to the prefix path
                     of this file followed by '.tree'.
        seq_type:    Either 'nucl' or 'prot'; selects the model. The
                     default expression evaluates to 'nucl'.
        num_threads: Thread count before `free_cores` is subtracted.
                     Defaults to the number of cores of the machine.
        free_cores:  Subtracted from the thread count, which never goes
                     below one.
        keep_dir:    If true, the whole RAxML working directory is moved
                     to `new_path`, replacing any directory already
                     there. Otherwise only the best tree file is moved.

    Returns a FilePath pointing to `new_path`.

    Raises a ValueError when `seq_type` is not one of the two accepted
    values. This is checked before the computation is started.
    """
    import os
    # Check output #
    if new_path is None: new_path = self.prefix_path + '.tree'
    # What model to choose #
    models = {'nucl': "GTRGAMMA", 'prot': "PROTGAMMAJTTF"}
    if seq_type not in models:
        message = "Unknown sequence type %r, expected 'nucl' or 'prot'."
        raise ValueError(message % (seq_type,))
    model = models[seq_type]
    # Threads #
    if num_threads is None: num_threads = multiprocessing.cpu_count() - free_cores
    else:                   num_threads = int(num_threads) - free_cores
    num_threads = max(1, num_threads)
    # Run it #
    temp_dir = new_temp_dir()
    sh.raxml811('-m', model, "-T", num_threads, '-p', 1, '-s', self.path,
                '-n', 'tree', '-w', temp_dir, '-f', 'a', '-x', 1,
                '-N', 'autoMR')
    # Move into place #
    if keep_dir:
        # Only clear the destination when a previous directory is there.
        # Removing a missing path would raise once the run has finished.
        if os.path.isdir(new_path): shutil.rmtree(new_path)
        shutil.move(temp_dir, new_path)
    else:
        shutil.move(temp_dir + 'RAxML_bestTree.tree', new_path)
    # Return #
    return FilePath(new_path)
def set_defaults(self):
    """
    Fill in every attribute that was left empty with its default value.

    Concerns the database, the filtering options, the output path, the
    number of threads and the extra parameters.
    """
    # Special database objects carry one attribute per algorithm #
    if self.algorithm == 'blast':
        if hasattr(self.database, 'blast_db'):
            self.database = self.database.blast_db
        # Anything that is still not a BLASTdb is taken to be a path #
        if not isinstance(self.database, BLASTdb):
            self.database = BLASTdb(self.database)
    elif self.algorithm == 'vsearch':
        if hasattr(self.database, 'vsearch_db'):
            self.database = self.database.vsearch_db
    # No filtering unless requested #
    if self.filtering is None:
        self.filtering = {}
    # By default the output sits next to the input #
    destination = self.out_path
    if destination is None:
        destination = self.input_fasta.prefix_path + '.' + self.algorithm + 'out'
    self.out_path = FilePath(destination)
    # Both None and True mean: pick the thread count automatically #
    automatic = self.num_threads is None or self.num_threads is True
    if automatic:
        self.num_threads = min(multiprocessing.cpu_count(), 32)
    # Extra params to be given to the search algorithm #
    if self.params is None:
        self.params = {}
def index_bowtie(self):
    """
    Build a bowtie2 compatible index of this fasta file.

    The fasta path itself is used as the base name of the index, and the
    path with the '.1.bt2' suffix is returned as a FilePath.
    """
    # The executable returns exit code 1 on an empty fasta, so stop early #
    assert self
    # Same path for the input and for the index base name #
    fasta_path = self.path
    sh.bowtie2_build(fasta_path, fasta_path)
    index_path = fasta_path + '.1.bt2'
    return FilePath(index_path)
def save(self, **kw):
    """
    Write a copy of the dataframe to disk in each requested format.

    A `path` keyword argument overrides `self.path`. The formats are read
    from `self.formats`, where 'tex' and 'csv' are recognized. The CSV
    file goes to the same path with its extension replaced by 'csv'.

    Returns the last path that was selected: the CSV path if 'csv' was
    requested, and the starting path otherwise.
    """
    # Never touch the original dataframe #
    frame = self.df.copy()
    # Cosmetic changes to the index name and the column names #
    if self.capital_index and frame.index.name is not None:
        frame.index.name = frame.index.name.capitalize()
    if self.upper_columns:
        frame.columns = frame.columns.str.upper()
    # The caller may redirect the output #
    path = FilePath(kw['path']) if 'path' in kw else self.path
    # This special string is an alias for our own formatter #
    if self.float_format_tex == 'split_thousands':
        self.float_format_tex = self.split_thousands
    # The destination directory has to exist #
    self.base_dir.create_if_not_exists()
    # Latex version #
    requested = self.formats
    if 'tex' in requested:
        tex_options = dict(float_format  = self.float_format_tex,
                           na_rep        = self.na_rep,
                           index         = self.index,
                           bold_rows     = self.bold_rows,
                           column_format = self.column_format,
                           escape        = self.escape_tex)
        frame.to_latex(str(path), **tex_options)
    # CSV version (plain text) #
    if 'csv' in requested:
        path = path.replace_extension('csv')
        csv_options = dict(float_format = self.float_format_csv,
                           index        = self.index)
        frame.to_csv(str(path), **csv_options)
    # Hand back the path #
    return path
def __init__(self, data_dir=None):
    """
    Compute every location this database needs, remote and local.

    `data_dir` is the directory that contains all databases. It defaults
    to 'databases/' inside `home`.
    """
    # Fall back on the standard location #
    data_dir = home + 'databases/' if data_dir is None else data_dir
    # Everything lives under a directory named after the database #
    self.base_dir  = DirectoryPath(data_dir + self.short_name + '/')
    self.autopaths = AutoPaths(self.base_dir, self.all_paths)
    # All file names start with the same release tag #
    release = "gg_13_8_99"
    # The two remote archives #
    self.ref_url = self.base_url + (release + ".refalign.tgz")
    self.tax_url = self.base_url + (release + ".taxonomy.tgz")
    # Where each archive is saved locally #
    self.ref_dest = self.autopaths.alignment
    self.tax_dest = self.autopaths.taxonomy
    # The results after download, as FilePath objects #
    self.alignment = FilePath(self.base_dir + (release + ".refalign"))
    self.taxonomy  = FilePath(self.base_dir + (release + ".gg.tax"))
def __init__(self, data_dir=None):
    """
    Compute every location this database needs, remote and local.

    `data_dir` is the directory that contains all databases. It defaults
    to 'databases/' inside `home`. The file names depend on
    `self.version`.
    """
    # Fall back on the standard location #
    data_dir = home + 'databases/' if data_dir is None else data_dir
    # Everything lives under a directory named after the database #
    self.base_dir  = data_dir + self.short_name + '/'
    self.autopaths = AutoPaths(self.base_dir, self.all_paths)
    # The remote archive and where it gets saved locally #
    self.url  = self.base_url + "silva.nr_v%s.tgz" % self.version
    self.dest = self.autopaths.tgz
    # Templates for the results; the version is filled in afterwards #
    alignment_template = self.base_dir + "silva.nr_v%s.align"
    taxonomy_template  = self.base_dir + "silva.nr_v%s.tax"
    self.alignment = FilePath(alignment_template % self.version)
    self.taxonomy  = FilePath(taxonomy_template  % self.version)
    # The part that mothur will use for naming files #
    self.nickname = "nr_v%s" % self.version
def to_qual(self, path, verbose=False):
    """
    Write every record of this object to `path` in the 'qual' format.

    With `verbose` set, the iteration is wrapped in a tqdm progress bar.
    Returns `path` as a FilePath.
    """
    import tqdm
    with open(path, 'w') as handle:
        # Only show progress when asked to #
        records = tqdm.tqdm(self) if verbose else self
        for record in records:
            SeqIO.write(record, handle, 'qual')
    return FilePath(path)
def __init__(self, version, base_dir=None):
    """
    Compute every location this versioned database needs.

    `version` is appended to the short name and used in the file names.
    `base_dir` defaults to `home`; the files go in 'databases/' below it.
    """
    # The short name becomes specific to this version #
    self.version    = version
    self.short_name = self.short_name + "_" + self.version
    # Where everything is stored #
    root = home if base_dir is None else base_dir
    self.base_dir = root + 'databases/' + self.short_name + '/'
    self.p        = AutoPaths(self.base_dir, self.all_paths)
    # The remote location and the local archive #
    self.url  = self.base_url + self.version
    self.dest = self.p.archive
    # The two result files share a common stem #
    stem = self.base_dir + ("pr_two.gb203_v%s" % self.version)
    self.alignment = FilePath(stem + ".align")
    self.taxonomy  = FilePath(stem + ".tax")
    # The part that mothur will use for naming files #
    self.nickname = "gb203_v%s" % self.version
def __init__(self,
             query_path,
             db_path,
             seq_type   = 'prot' or 'nucl',      # The seq type of the query_path file
             params     = None,                  # Extra params for the command line
             algorithm  = "blastn" or "blastp",  # Will be auto-determined with seq_type
             out_path   = None,                  # Where the results will be dropped
             executable = None,                  # Path to a specific binary, if wanted
             cpus       = None,                  # The number of threads to use
             num        = None,                  # When parallelized, the number of this thread
             _out       = None,                  # Store the stdout at this path
             _err       = None):                 # Store the stderr at this path
    """
    Record the settings for searching `query_path` against `db_path`.

    The defaults written as `'prot' or 'nucl'` and `"blastn" or "blastp"`
    evaluate to their first string; the second one only hints at the
    other accepted value.

    When `out_path` is None the results land next to the query, with
    `self.extension` appended. When it ends with a slash it is treated as
    a directory and the file is named after the query prefix.

    Passing True as `_out` or `_err` stores the stream next to the
    output file, with '.stdout' or '.stderr' appended.
    """
    # What to search with and what to search against #
    self.query = FASTA(query_path)
    self.db    = FilePath(db_path)
    # Plain attributes #
    self.seq_type  = seq_type
    self.algorithm = algorithm
    self.num       = num
    self.params    = params or {}
    # The standard output and error #
    self._out = _out
    self._err = _err
    # Work out where the output goes #
    if out_path is None:
        destination = self.query.prefix_path + self.extension
    elif out_path.endswith('/'):
        destination = out_path + self.query.prefix + self.extension
    else:
        destination = out_path
    self.out_path = FilePath(destination)
    # The binary to call #
    self.executable = FilePath(executable)
    # Use every core, capped at 32, unless told otherwise #
    self.cpus = min(multiprocessing.cpu_count(), 32) if cpus is None else cpus
    # True means: derive the stream paths from the output path #
    if self._out is True:
        self._out = self.out_path + '.stdout'
    if self._err is True:
        self._err = self.out_path + '.stderr'
def set_paths(self, base_dir, script_path):
    """
    Set the directory, the script path and the outfile path.

    With a `base_dir`, the script is 'run.' plus the extension of the
    current language, and both 'change_dir' and 'out_file' in
    `self.kwargs` are pointed into that directory. A `script_path` that
    is given always wins. When neither argument is given, the script
    goes to a new temporary path.
    """
    options = self.kwargs
    # User supplied locations are made absolute #
    if 'change_dir' in options:
        absolute = os.path.abspath(options['change_dir'])
        options['change_dir'] = DirectoryPath(absolute)
    if 'out_file' in options:
        absolute = os.path.abspath(options['out_file'])
        options['out_file'] = FilePath(absolute)
    # Either everything goes in the base directory... #
    if base_dir is not None:
        self.base_dir = DirectoryPath(os.path.abspath(base_dir))
        script_name = "run." + self.extensions[self.language]
        self.script_path = FilePath(base_dir + script_name)
        options['change_dir'] = base_dir
        options['out_file'] = FilePath(base_dir + "run.out")
    # ...or the script gets a temporary path if it has none at all #
    elif script_path is None:
        self.script_path = FilePath(new_temp_path())
    # An explicit script path overrides the above #
    if script_path is not None:
        self.script_path = FilePath(os.path.abspath(script_path))
def build_tree_fast(self, new_path=None, seq_type='nucl' or 'prot'):
    """
    Make a tree with FastTree. Names will be truncated however.

    `new_path` defaults to the prefix path of this file followed by
    '.tree'. The default `seq_type` expression evaluates to 'nucl', in
    which case the '-nt' flag is passed. Returns `new_path` as a
    FilePath.
    """
    # Default location of the result #
    if new_path is None:
        new_path = self.prefix_path + '.tree'
    # The nucleotide flag comes first, when it applies #
    flags = ['-nt'] if seq_type == 'nucl' else []
    arguments = flags + ['-gamma', '-out', new_path, self.path]
    # Call the executable #
    sh.FastTree(*arguments)
    return FilePath(new_path)
def mappings(self):
    """The cog mappings: `self.p.unzipped_mappings` as a FilePath."""
    location = self.p.unzipped_mappings
    return FilePath(location)
def index_samtools(self):
    """
    Build a samtools compatible index of this fasta file.

    Runs `samtools faidx` on the file and returns its path with the
    '.fai' suffix as a FilePath.
    """
    fasta_path = self.path
    sh.samtools('faidx', fasta_path)
    index_path = fasta_path + '.fai'
    return FilePath(index_path)
def __init__(self, parent):
    """Keep the parent and derive the PDF path from its prefix path."""
    self.parent = parent
    # The file sits next to the parent, with its own suffix #
    pdf_location = self.parent.prefix_path + '_len_hist.pdf'
    self.path = FilePath(pdf_location)
def new_temp_file(**kwargs):
    """
    Create a new temporary file and return its path as a FilePath.

    The file is closed but not deleted. Keyword arguments are forwarded
    to `tempfile.NamedTemporaryFile`.
    """
    # Leaving the block closes the handle; the file itself stays #
    with tempfile.NamedTemporaryFile(delete=False, **kwargs) as handle:
        location = handle.name
    return FilePath(location)