Exemplo n.º 1
0
 def __init__(self,
              query_path,
              db_path,
              params=None,
              algorithm="blastn",
              version="plus",
              out_path=None,
              executable=None):
     """Prepare a search of a query FASTA file against a database.

     Arguments:
         query_path: path of the query FASTA file.
         db_path:    path of the database, wrapped in a FilePath.
         params:     optional dict of extra options. It is copied, so the
                     defaults added below never end up in the caller's dict.
         algorithm:  name of the algorithm, stored as given.
         version:    either "plus" (the default) or "legacy". It decides
                     which thread-count option ('-num_threads' or '-a')
                     receives a default value.
         out_path:   where the results go. None means the query's
                     `prefix_path` plus '.blastout'; a value ending in '/'
                     is used as a directory in which a file named after the
                     query's `prefix` is placed; anything else is used as is.
         executable: optional path of the program, wrapped in a FilePath.
     """
     # Save attributes #
     self.path = query_path
     self.query = FASTA(query_path)
     self.db = FilePath(db_path)
     self.version = version
     self.algorithm = algorithm
     # Work on a copy: an empty or missing dict was never modified in place,
     # so a non-empty one should not be either #
     self.params = params.copy() if params else {}
     self.executable = FilePath(executable)
     # Output #
     if out_path is None:
         self.out_path = self.query.prefix_path + '.blastout'
     elif out_path.endswith('/'):
         self.out_path = out_path + self.query.prefix + '.blastout'
     else:
         self.out_path = out_path
     self.out_path = FilePath(self.out_path)
     # Defaults: use every core unless the caller already chose a number #
     self.cpus = multiprocessing.cpu_count()
     if self.version == 'plus':
         if '-num_threads' not in self.params:
             self.params['-num_threads'] = self.cpus
     elif self.version == 'legacy':
         if '-a' not in self.params:
             self.params['-a'] = self.cpus
Exemplo n.º 2
0
 def only_top_sequences(self):
     """Make a new fasta file where only the top N sequences are included
     (in terms of their abundance).

     The step is skipped, and the renamed FASTA file is returned unchanged,
     when no abundance info was given or when `self.N` is None. Otherwise
     the new 'top_seqs.fasta' file inside `self.out_dir` is returned.

     If N is larger than the number of sequences in the input file, a
     UserWarning is emitted and N is lowered to that number."""
     if not self.abundances: return self.renamed_fasta
     if self.N is None: return self.renamed_fasta
     # Parse it #
     N = int(self.N)
     # Create file #
     only_top_fasta = FASTA(self.out_dir + 'top_seqs.fasta')
     # Print status #
     print("Using: " + self.renamed_fasta)
     print("--> STEP 1B: Get the top %i sequences (in terms of their abundances)." % N)
     # Check the user inputted value #
     total = self.input_file.count
     if N > total:
         msg = "You asked for the top %i sequences"
         msg += ", but your input file only contains %i sequences!"
         # Format with the parsed integer: self.N itself may still be a
         # string, which '%i' would reject with a TypeError #
         msg = msg % (N, total)
         warnings.warn(msg, UserWarning)
         N = total
     # Do it: rank the rows by their summed abundance and keep the first N #
     ranked = self.df_abundances.sum(axis=1).sort_values(ascending=False)
     ids = set(self.orig_names_to_renamed[x] for x in ranked.index[0:N])
     self.renamed_fasta.extract_sequences(only_top_fasta, ids)
     self.timer.print_elapsed()
     return only_top_fasta
Exemplo n.º 3
0
 def __init__(self, input_file,
              seq_type      = 'nucl',
              search_algo   = 'blast',
              search_db     = 'nt',
              normalization = 'flat',
              proportional  = True,
              backtracking  = False,
              restrict      = None,
              num_threads   = None,
              out_dir       = None,
              min_identity  = 0.97,
              e_value       = 0.0001,
              max_targets   = 10,
              min_coverage  = 0.97,
              abundances    = None,
              N             = None):
     """Record and validate every setting of a run.

     * `input_file` is wrapped in a FASTA object and must exist.
     * `abundances` is wrapped in a FilePath and, when set, must exist.
     * `normalization` has to be one of 'flat', 'ui' or 'upui'.
     * `restrict`, when given, has to start with 'ENVO:' and be one of the
       values of `self.serial_to_concept`.
     * `num_threads` defaults to the CPU count capped at 32 and is never
       larger than `self.input_file.count`.
     * `out_dir` defaults to the directory of the input file; it is given
       a trailing slash and created when it does not exist yet.
     * The remaining arguments are stored after conversion to bool, float
       or int as done below, or stored as given.

     A plain Exception is raised when `normalization` or `restrict` is
     rejected."""
     # The input sequences #
     fasta = FASTA(input_file)
     self.input_file = fasta
     fasta.must_exist()
     # The optional abundance table #
     self.abundances = FilePath(abundances)
     if self.abundances:
         self.abundances.must_exist()
     # Simple settings #
     self.N = N
     self.seq_type = seq_type
     self.backtracking = bool(backtracking)
     self.proportional = bool(proportional)
     # Which normalization to apply #
     options = ('flat', 'ui', 'upui')
     if normalization not in options:
         raise Exception('Normalization has to be one of %s' % (','.join(options)))
     self.normalization = normalization
     # An optional term to restrict the results to #
     if restrict:
         if restrict[:5] != 'ENVO:':
             message = "The '--restrict' parameter must be an ENVO term, not '%s'."
             raise Exception(message % restrict)
         if restrict not in self.serial_to_concept.values():
             raise Exception("The '--restrict' parameter must be a known ENVO term.")
     self.restrict = restrict
     # How and where to search #
     self.search_algo = search_algo
     self.search_db = search_db
     # How many cores to use, never more than there are sequences #
     if num_threads is None:
         threads = min(multiprocessing.cpu_count(), 32)
     else:
         threads = int(num_threads)
     self.num_threads = min(threads, self.input_file.count)
     # Thresholds used when filtering hits #
     self.min_identity = float(min_identity)
     self.e_value      = float(e_value)
     self.max_targets  = int(max_targets)
     self.min_coverage = float(min_coverage)
     # A stopwatch for the whole run #
     self.timer = Timer()
     # The directory that receives every output #
     directory = self.input_file.directory if out_dir is None else out_dir
     if not directory.endswith('/'):
         directory += '/'
     self.out_dir = directory
     if not os.path.exists(self.out_dir):
         os.makedirs(self.out_dir)
     # The helper that writes the outputs for the user #
     self.outputs = OutputGenerator(self)
Exemplo n.º 4
0
 def renamed_fasta(self):
     """Return the FASTA file 'renamed.fasta' located in the output directory.

     When that file does not exist yet it is produced first, by asking the
     input file to rename its sequences according to
     `self.orig_names_to_renamed` (names such as "C1", "C2", "C3" etc.)."""
     target = FASTA(self.out_dir + 'renamed.fasta')
     # Only do the work when a previous call has not produced the file #
     if not target.exists:
         print("--> STEP 1: Parse the input FASTA file.")
         self.input_file.rename_sequences(target, self.orig_names_to_renamed)
         self.timer.print_elapsed()
     return target
Exemplo n.º 5
0
 def __init__(self,
              query_path,
              db_path,
              params=None,
              out_path=None,
              executable=None):
     """Prepare a search of a query FASTA file against a database.

     Arguments:
         query_path: path of the query FASTA file.
         db_path:    the database, stored as given.
         params:     optional dict of extra options (an empty dict is used
                     when nothing, or something falsy, is passed).
         out_path:   where the results go. None means the query's
                     `prefix_path` plus '.vsearchout'; a value ending in
                     '/' is used as a directory in which a file named after
                     the query's `prefix` is placed; anything else is used
                     as is.
         executable: optional path of the program, wrapped in a FilePath.
     """
     # Inputs #
     self.query = FASTA(query_path)
     self.db = db_path
     self.params = params or {}
     self.executable = FilePath(executable)
     # Work out the destination of the results #
     suffix = '.vsearchout'
     if out_path is None:
         destination = self.query.prefix_path + suffix
     elif out_path.endswith('/'):
         destination = out_path + self.query.prefix + suffix
     else:
         destination = out_path
     self.out_path = FilePath(destination)
Exemplo n.º 6
0
 def __init__(self, path, num_parts=None, part_size=None, base_dir=None):
     """Describe how the FASTA file at `path` is divided into parts.

     Arguments:
         path:      path of the FASTA file to split.
         num_parts: optional number of parts to make.
         part_size: optional target size of one part, as a number of bytes.
                    When given, the number of parts is recomputed from it
                    and replaces `num_parts`.
         base_dir:  optional directory holding the parts; defaults to
                    `path` followed by '.parts/'.

     NOTE(review): when neither `num_parts` nor `part_size` is given,
     `self.num_parts` has to come from somewhere else (for instance a class
     attribute) -- confirm against the class definition.
     """
     # Basic #
     self.path = path
     # Directory #
     if base_dir is None: self.base_dir = DirectoryPath(path + '.parts/')
     else: self.base_dir = DirectoryPath(base_dir)
     # Num parts #
     if num_parts is not None: self.num_parts = num_parts
     # Evaluate size #
     if part_size is not None:
         # The size is used as is, no human friendly parsing is done #
         self.bytes_target = part_size
         # Force a true division: with two integers Python 2 would floor
         # the quotient before the ceil and the last part would be lost #
         self.num_parts = int(
             math.ceil(float(self.count_bytes) / self.bytes_target))
     # Make parts: they live in directories numbered "001", "002", etc. #
     self.make_name = lambda i: self.base_dir + "%03d/part.fasta" % i
     self.parts = [
         FASTA(self.make_name(i)) for i in range(1, self.num_parts + 1)
     ]
     # Give a number to each part #
     # NOTE(review): these numbers start at 0 while the directory names
     # start at 1 -- confirm this is intended.
     for i, part in enumerate(self.parts):
         part.num = i
Exemplo n.º 7
0
"""
========================
Generate fake abundances
========================
"""

# Modules #
import os, inspect, numpy, pandas, names
from seqenv.fasta import FASTA

# Constants #
# Path of this very file, as reported by the frame being executed #
current_script = inspect.getframeinfo(inspect.currentframe()).filename
# Absolute directory containing this file, with a trailing slash #
current_dir = os.path.abspath(current_script)
current_dir = os.path.dirname(current_dir) + '/'
# The example sequences that the fake abundances are generated for #
fasta = FASTA(''.join((current_dir, "../examples/samples/community.fasta")))


################################################################################
def data():
    """Build a dataframe of fake abundances.

    There is one row per sequence of the module-level `fasta`, labelled
    with the sequence id, and ten columns labelled with the result of
    `names.get_first_name()`. The values are integers drawn with
    `numpy.random.randint(0, 1000, ...)` after seeding numpy with 0."""
    num_columns = 10
    num_rows = len(fasta)
    # Seed numpy so that the matrix is the same on every run #
    numpy.random.seed(0)
    matrix = numpy.random.randint(0, 1000, (num_rows, num_columns))
    row_labels = [seq.id for seq in fasta]
    column_labels = [names.get_first_name() for _ in range(num_columns)]
    return pandas.DataFrame(matrix, index=row_labels, columns=column_labels)


# Build the fake abundance dataframe as soon as this module is executed #
df = data()