Пример #1
0
 def __init__(self,
              query_path,
              db_path,
              params=None,
              algorithm="blastn",
              version="plus",
              out_path=None,
              executable=None):
     """Record everything needed to run one BLAST search.

     * `query_path`: path of the query sequences; wrapped in a `FASTA`.
     * `db_path`: path of the database; wrapped in a `FilePath`.
     * `params`: optional dictionary of extra command line options.
       It is copied, so the caller's dictionary is never modified.
     * `algorithm`: name of the search algorithm, "blastn" by default.
     * `version`: "plus" (the default) or "legacy". Any other value is
       stored as is but then no default thread option is added.
     * `out_path`: where the results go. When `None`, the query's
       `prefix_path` plus '.blastout' is used. When it ends with '/',
       the query's `prefix` plus '.blastout' is appended to it.
       Otherwise it is used unchanged.
     * `executable`: passed to `FilePath` and stored.
     """
     # Save attributes #
     self.path = query_path
     self.query = FASTA(query_path)
     self.db = FilePath(db_path)
     self.version = version
     self.algorithm = algorithm
     # Work on a copy: a thread option may be inserted further down #
     self.params = params.copy() if params else {}
     self.executable = FilePath(executable)
     # Output #
     if out_path is None:
         self.out_path = self.query.prefix_path + '.blastout'
     elif out_path.endswith('/'):
         self.out_path = out_path + self.query.prefix + '.blastout'
     else:
         self.out_path = out_path
     self.out_path = FilePath(self.out_path)
     # Defaults: use every core unless the caller chose a number #
     self.cpus = multiprocessing.cpu_count()
     if self.version == 'plus':
         # The thread option is named '-num_threads' for this version #
         if '-num_threads' not in self.params:
             self.params['-num_threads'] = self.cpus
     elif self.version == 'legacy':
         # The thread option is named '-a' for this version #
         if '-a' not in self.params:
             self.params['-a'] = self.cpus
Пример #2
0
 def __init__(self,
              input_fasta,
              seq_type,
              database,
              algorithm='blast',
              num_threads=None,
              filtering=None,
              out_path=None):
     """Store the settings of a similarity search.

     * `input_fasta`: the query object; its `prefix_path` attribute is
       read when no `out_path` is given.
     * `seq_type`, `database`, `algorithm`: stored unchanged.
     * `num_threads`: defaults to the number of cores of the machine.
     * `filtering`: hit filtering options, an empty dict when `None`.
     * `out_path`: result file; defaults to the input's `prefix_path`
       followed by '.', the algorithm name and 'out'.
     """
     # Mandatory settings #
     self.input_fasta = input_fasta
     self.seq_type = seq_type
     self.database = database
     # Which search program to use #
     self.algorithm = algorithm
     # Hit filtering options #
     self.filtering = {} if filtering is None else filtering
     # Fall back on every available core #
     if num_threads is None:
         num_threads = multiprocessing.cpu_count()
     self.num_threads = num_threads
     # Derive the result file from the input when none is given #
     if out_path is None:
         out_path = self.input_fasta.prefix_path + '.' + algorithm + 'out'
     self.out_path = FilePath(out_path)
Пример #3
0
 def __init__(self, input_file,
              seq_type='nucl',
              search_algo='blast',
              search_db='nt',
              normalization='flat',
              proportional=True,
              backtracking=False,
              restrict=None,
              num_threads=None,
              out_dir=None,
              min_identity=0.97,
              e_value=0.0001,
              max_targets=10,
              min_coverage=0.97,
              abundances=None,
              N=None):
     """Validate and store every option of the pipeline.

     The input file is wrapped in a `FASTA` and must exist, and the same
     goes for the abundance file when one is given. An `Exception` is
     raised when `normalization` is not one of 'flat', 'ui' or 'upui',
     when `restrict` does not start with 'ENVO:', or when it is not
     among the values of `self.serial_to_concept`. The numeric options
     are coerced with `int`/`float`. The output directory is created
     when missing and always ends with a '/'.
     """
     # The sequences to process have to be on disk #
     self.input_file = FASTA(input_file)
     self.input_file.must_exist()
     # The abundance file is only checked when one was given #
     self.abundances = FilePath(abundances)
     if self.abundances:
         self.abundances.must_exist()
     # Plain settings #
     self.N = N
     self.seq_type = seq_type
     self.backtracking = bool(backtracking)
     self.proportional = bool(proportional)
     # Only a few normalization schemes are accepted #
     options = ('flat', 'ui', 'upui')
     if normalization not in options:
         raise Exception('Normalization has to be one of %s' % (','.join(options)))
     self.normalization = normalization
     # A restriction has to look like an ENVO term and be a known one #
     if restrict:
         if restrict[:5] != 'ENVO:':
             message = "The '--restrict' parameter must be an ENVO term, not '%s'."
             raise Exception(message % restrict)
         if restrict not in self.serial_to_concept.values():
             raise Exception("The '--restrict' parameter must be a known ENVO term.")
     self.restrict = restrict
     # What to search with and against #
     self.search_algo = search_algo
     self.search_db = search_db
     # At most 32 cores by default, never more than `input_file.count` #
     if num_threads is None:
         self.num_threads = min(multiprocessing.cpu_count(), 32)
     else:
         self.num_threads = int(num_threads)
     self.num_threads = min(self.num_threads, self.input_file.count)
     # Thresholds applied to the hits #
     self.min_identity = float(min_identity)
     self.e_value = float(e_value)
     self.max_targets = int(max_targets)
     self.min_coverage = float(min_coverage)
     # Measures how long the pipeline takes #
     self.timer = Timer()
     # All outputs live in one directory, by default the input's #
     directory = self.input_file.directory if out_dir is None else out_dir
     if not directory.endswith('/'):
         directory += '/'
     self.out_dir = directory
     if not os.path.exists(self.out_dir):
         os.makedirs(self.out_dir)
     # The object that writes the final outputs for the user #
     self.outputs = OutputGenerator(self)
Пример #4
0
 def __init__(self,
              query_path,
              db_path,
              params=None,
              out_path=None,
              executable=None):
     """Record everything needed to run one VSEARCH search.

     * `query_path`: path of the query sequences; wrapped in a `FASTA`.
     * `db_path`: the database, stored unchanged.
     * `params`: optional dictionary of extra options, `{}` when empty.
     * `out_path`: where the results go. When `None`, the query's
       `prefix_path` plus '.vsearchout' is used. When it ends with '/',
       the query's `prefix` plus '.vsearchout' is appended to it.
     * `executable`: passed to `FilePath` and stored.
     """
     # Inputs #
     self.query = FASTA(query_path)
     self.db = db_path
     self.params = params or {}
     self.executable = FilePath(executable)
     # Work out the result file #
     suffix = '.vsearchout'
     if out_path is None:
         out_path = self.query.prefix_path + suffix
     elif out_path.endswith('/'):
         out_path = out_path + self.query.prefix + suffix
     self.out_path = FilePath(out_path)
Пример #5
0
 def seq_to_gis(self):
     """A dictionary linking every input sequence to a list of gi identifiers found
     that are relating to it. If a sequence had no hits it links to an empty list.
     The result is stored in 'seq_to_gis.pickle' inside the output directory and
     is loaded from there on later calls instead of being computed again.
     NB: You will get a KeyError if there are sequences in the search result files
     that are not present in the inputted fasta."""
     cache_path = FilePath(self.out_dir + 'seq_to_gis.pickle')
     # Reuse the result of a previous run when there is one #
     if cache_path.exists:
         # NOTE: unpickling executes whatever the file contains, so the
         # output directory must not be writable by untrusted parties.
         # Pickle files are always opened in binary mode.
         with open(cache_path, 'rb') as handle: return pickle.load(handle)
     # Touch the attribute before announcing this step #
     # NOTE(review): presumably this triggers the search itself - confirm #
     self.search_results
     print("--> STEP 4: Parsing the search results")
     # Start with an empty list for every sequence #
     result = {seq: [] for seq in self.only_top_sequences.ids}
     for hit in self.search_results:
         # First field: the sequence name. Second field: an identifier
         # whose second '|'-separated part is taken as the gi number.
         result[hit[0]].append(hit[1].split('|')[1])
     # Save for next time, in binary mode like the read above #
     with open(cache_path, 'wb') as handle: pickle.dump(result, handle)
     self.timer.print_elapsed()
     return result
Пример #6
0
# Internal modules #
from seqenv.common.autopaths import FilePath
from seqenv.common.timer import Timer
from seqenv.common.database import Database

# Third party modules #
from tqdm import trange, tqdm

# Directory containing this script, with a trailing slash #
filename = inspect.getframeinfo(inspect.currentframe()).filename
current_dir = '%s/' % os.path.dirname(os.path.abspath(filename))
default_data_dir = '%s../data_envo/' % current_dir

# Default file names of the restore archive and of the database #
restore_path, db_path = FilePath("restore_gi_db.zip"), FilePath("gi_db.sqlite3")


###############################################################################
class Tagger(object):
    """Interface to the C-coded tagger. Randomly segfaults last
    time I recompiled it and then tried it on a new machine."""
    def __init__(self, entities=None, names=None, globs=None, data_dir=None):
        # Defaults #
        if data_dir is None: data_dir = default_data_dir
        if entities is None: entities = data_dir + 'envo_entities.tsv'
        if names is None: names = data_dir + 'envo_names.tsv'
        if globs is None: globs = data_dir + 'envo_global.tsv'
        # Make an instance of the API #
        import tagger as tagger_api