def __init__(self, query_path, db_path, params=None, algorithm="blastn",
             version="plus", out_path=None, executable=None):
    """Store every parameter needed to later launch a BLAST search.

    :param query_path: Path to the FASTA file with the query sequences.
    :param db_path: Path to the BLAST database to search against.
    :param params: Optional dict of extra command-line parameters.
    :param algorithm: The BLAST flavor to run, e.g. "blastn".
    :param version: Either "plus" (the C++ blast+ suite) or "legacy"
                    (the old C toolkit). NB: the previous default,
                    the expression `"plus" or "legacy"`, always
                    evaluated to "plus"; it is now written plainly.
    :param out_path: Where results go. None gives `<query>.blastout`,
                     a path ending in '/' is treated as a directory,
                     anything else is used as the output file itself.
    :param executable: Optional path to the blast executable to use.
    """
    # Save attributes #
    self.path = query_path
    self.query = FASTA(query_path)
    self.db = FilePath(db_path)
    self.version = version
    self.algorithm = algorithm
    # Never share one mutable default dict between instances #
    self.params = params if params else {}
    self.executable = FilePath(executable)
    # Output: default file, directory, or explicit file path #
    if out_path is None:
        self.out_path = self.query.prefix_path + '.blastout'
    elif out_path.endswith('/'):
        self.out_path = out_path + self.query.prefix + '.blastout'
    else:
        self.out_path = out_path
    self.out_path = FilePath(self.out_path)
    # Defaults: use all cores unless the caller already set a thread count.
    # The flag name differs between the two BLAST generations. #
    self.cpus = multiprocessing.cpu_count()
    if self.version == 'plus':
        if '-num_threads' not in self.params:
            self.params['-num_threads'] = self.cpus
    if self.version == 'legacy':
        if '-a' not in self.params:
            self.params['-a'] = self.cpus
def __init__(self, input_fasta, seq_type, database, algorithm='blast',
             num_threads=None, filtering=None, out_path=None):
    """Record the parameters of a similarity search.

    :param input_fasta: The FASTA object holding the query sequences.
    :param seq_type: The type of the sequences (e.g. nucleotides).
    :param database: The database to search against.
    :param algorithm: Which search program to use (defaults to 'blast').
    :param num_threads: How many cores to use; None means all of them.
    :param filtering: Optional dict of hit-filtering options.
    :param out_path: Destination for the results; None derives a file
                     name from the input FASTA and the algorithm name.
    """
    # Base parameters #
    self.input_fasta = input_fasta
    self.seq_type = seq_type
    self.database = database
    # Optional #
    self.algorithm = algorithm
    # The filtering options #
    self.filtering = {} if filtering is None else filtering
    # Number of cores to use #
    self.num_threads = multiprocessing.cpu_count() if num_threads is None else num_threads
    # Output path #
    default = self.input_fasta.prefix_path + '.' + algorithm + 'out'
    self.out_path = FilePath(default if out_path is None else out_path)
def __init__(self, input_file, seq_type='nucl', search_algo='blast',
             search_db='nt', normalization='flat', proportional=True,
             backtracking=False, restrict=None, num_threads=None,
             out_dir=None, min_identity=0.97, e_value=0.0001,
             max_targets=10, min_coverage=0.97, abundances=None, N=None):
    """Validate and store all parameters of an analysis run.

    :param input_file: Path to the input FASTA file (must exist).
    :param seq_type: Type of the sequences ('nucl' by default).
    :param search_algo: Which similarity search program to run.
    :param search_db: Which database to search against.
    :param normalization: One of 'flat', 'ui' or 'upui'.
    :param proportional: Whether counts are distributed proportionally.
    :param backtracking: Whether to enable backtracking.
    :param restrict: Optional ENVO term ('ENVO:...') limiting results.
    :param num_threads: Cores to use; None means all (capped at 32),
                        and never more than the number of sequences.
    :param out_dir: Directory for all outputs; created if missing,
                    defaults to the input file's directory.
    :param min_identity, e_value, max_targets, min_coverage:
        Hit-filtering thresholds, coerced to their numeric types.
    :param abundances: Optional abundance file (must exist if given).
    :param N: Optional limit on the number of sequences to process.
    :raises Exception: On an unknown normalization or a bad ENVO term.
    """
    # Base parameters #
    self.input_file = FASTA(input_file)
    self.input_file.must_exist()
    # Abundance file #
    self.abundances = FilePath(abundances)
    if self.abundances: self.abundances.must_exist()
    # Other parameters #
    self.N = N
    self.seq_type = seq_type
    self.backtracking = bool(backtracking)
    self.proportional = bool(proportional)
    # Normalization parameters #
    options = ('flat', 'ui', 'upui')
    message = 'Normalization has to be one of %s' % (','.join(options))
    if normalization not in options: raise Exception(message)
    self.normalization = normalization
    # Restrict parameter: must look like an ENVO term and be a known one #
    message = "The '--restrict' parameter must be an ENVO term, not '%s'."
    if restrict and not restrict.startswith('ENVO:'): raise Exception(message % restrict)
    message = "The '--restrict' parameter must be a known ENVO term."
    if restrict and restrict not in self.serial_to_concept.values(): raise Exception(message)
    self.restrict = restrict
    # Search parameters #
    self.search_algo = search_algo
    self.search_db = search_db
    # Number of cores to use: never more than the sequence count #
    if num_threads is None: self.num_threads = min(multiprocessing.cpu_count(), 32)
    else: self.num_threads = int(num_threads)
    self.num_threads = min(self.num_threads, self.input_file.count)
    # Hit filtering parameters #
    self.min_identity = float(min_identity)
    self.e_value = float(e_value)
    self.max_targets = int(max_targets)
    self.min_coverage = float(min_coverage)
    # Time the pipeline execution #
    self.timer = Timer()
    # Keep all outputs in a directory (created on demand) #
    if out_dir is None: self.out_dir = self.input_file.directory
    else: self.out_dir = out_dir
    if not self.out_dir.endswith('/'): self.out_dir += '/'
    if not os.path.exists(self.out_dir): os.makedirs(self.out_dir)
    # The object that can make the outputs for the user #
    self.outputs = OutputGenerator(self)
def __init__(self, query_path, db_path, params=None, out_path=None, executable=None):
    """Record the parameters of a vsearch run.

    :param query_path: Path to the FASTA file with the query sequences.
    :param db_path: Path to the database to search against.
    :param params: Optional dict of extra command-line parameters.
    :param out_path: Destination for the results. None derives a file
                     name from the query, a path ending in '/' is
                     treated as a directory, anything else is used
                     as the output file itself.
    :param executable: Optional path to the vsearch executable.
    """
    # Save attributes #
    self.query = FASTA(query_path)
    self.db = db_path
    self.params = params if params else {}
    self.executable = FilePath(executable)
    # Output: pick the destination, then wrap it once #
    if out_path is None:
        destination = self.query.prefix_path + '.vsearchout'
    elif out_path.endswith('/'):
        destination = out_path + self.query.prefix + '.vsearchout'
    else:
        destination = out_path
    self.out_path = FilePath(destination)
def seq_to_gis(self):
    """A dictionary linking every input sequence to a list of GI identifiers
    found that are relating to it. If a sequence had no hits it links to an
    empty list. The result is cached on disk as a pickle inside `self.out_dir`
    so the parsing is only done once.
    NB: You will get a KeyError if there are sequences in the search result
    files that are not present in the inputted fasta."""
    # Where the cached result lives on disk #
    seq_to_gis = FilePath(self.out_dir + 'seq_to_gis.pickle')
    # Check that it was run: no pickle means we must parse the raw results #
    if not seq_to_gis.exists:
        # Touching the attribute triggers the search if it hasn't run yet
        # (presumably a cached property -- TODO confirm) #
        self.search_results
        print "--> STEP 4: Parsing the search results"
        # Start every input sequence with an empty hit list #
        result = {seq:[] for seq in self.only_top_sequences.ids}
        for hit in self.search_results:
            seq_name = hit[0]
            # The GI number is the second field of the pipe-separated hit id #
            gi = hit[1].split('|')[1]
            result[seq_name].append(gi)
        # Cache to disk for next time #
        with open(seq_to_gis, 'w') as handle: pickle.dump(result, handle)
        self.timer.print_elapsed()
        return result
    # Parse the results #
    with open(seq_to_gis) as handle: return pickle.load(handle)
# Internal modules # from seqenv.common.autopaths import FilePath from seqenv.common.timer import Timer from seqenv.common.database import Database # Third party modules # from tqdm import trange, tqdm # Get the directory of this script # filename = inspect.getframeinfo(inspect.currentframe()).filename current_dir = os.path.dirname(os.path.abspath(filename)) + '/' default_data_dir = current_dir + '../data_envo/' # The default location of the database # restore_path = FilePath("restore_gi_db.zip") db_path = FilePath("gi_db.sqlite3") ############################################################################### class Tagger(object): """Interface to the C-coded tagger. Randomly segfaults last time I recompiled it and then tried it on a new machine.""" def __init__(self, entities=None, names=None, globs=None, data_dir=None): # Defaults # if data_dir is None: data_dir = default_data_dir if entities is None: entities = data_dir + 'envo_entities.tsv' if names is None: names = data_dir + 'envo_names.tsv' if globs is None: globs = data_dir + 'envo_global.tsv' # Make an instance of the API # import tagger as tagger_api