def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    
    # get cmd-line options
    fasta_fp=opts.input_fasta_fp
    qual_fp=opts.input_qual_fp
    output_dir=opts.output_dir
    
    # create output dir
    create_dir(output_dir)
    output_fps={}

    # open sequence files
    sequences=MinimalFastaParser(open(fasta_fp,'U'))
    qual_sequences=MinimalFastaParser(open(qual_fp,'U'))
    
    # iterate over seqs
    for seq_name,seq in sequences:
        
        # iterate over qual
        qual_seq_name,qual_seq=qual_sequences.next()
                
        # verify headers from seq and qual match
        if seq_name==qual_seq_name:
            # get the SampleID
            samp_id='_'.join(seq_name.split()[0].split('_')[:-1])
            samp_filename='seqs_%s' % (str(samp_id))
            # open files for output
            if str(samp_filename) not in output_fps:
                output_fps[str(samp_filename)] = open(join(output_dir,
                                        '%s.fastq' % (str(samp_filename))),'w')
            
            # write out the fastq format for seqs
            output_fps[str(samp_filename)].write('@%s\n%s\n+\n%s\n' % \
                                                 (seq_name,seq,qual_seq))
        else:
            print seq_name
    
    # close the files
    for s_id in output_fps:
        output_fps[str(s_id)].close()
def generate_full_split_lib_fastq(study, study_input_dir, zip_fname,
                                 files_to_remove,output_dir):
    """ Generate the full split-library fastq file """
    
    # define sequence output file
    seq_fname='study_%s_split_library_seqs.fastq.gz' % (str(study))
    fna_fname='study_%s_split_library_seqs.fna.gz' % (str(study))
    output_seq_fp=join(output_dir,seq_fname)
    output_fna_fp=join(output_dir,fna_fname)
    # add to list of files to remove
    files_to_remove.append(output_seq_fp)
    files_to_remove.append(output_fna_fp)
    
    output_seqs=gzip.open(output_seq_fp,'w')
    output_fna=gzip.open(output_fna_fp,'w')
    iterator=0
    
    # get a list of all files in study_dir
    processed_folders=listdir(study_input_dir)
    samples={}
    biom_files=[]
    for processed_folder in processed_folders:
        # determine if the file startswith the word "processed"
        if processed_folder.startswith('processed'):
            
            # define split-lib seq fp
            split_lib_seqs=join(study_input_dir,processed_folder,
                                'split_libraries','seqs.fna')
            
            # open sequence files
            seqs=MinimalFastaParser(open(split_lib_seqs,'U'))
            
            try:
                # for illumina
                split_lib_qual=join(study_input_dir,processed_folder,
                                    'split_libraries','seqs.qual')
                # open sequence files
                qual_sequences=MinimalFastaParser(open(split_lib_qual,'U'))
            except IOError:
                # for 454
                split_lib_qual=join(study_input_dir,processed_folder,
                                    'split_libraries','seqs_filtered.qual')
                # open sequence files
                qual_sequences=MinimalFastaParser(open(split_lib_qual,'U'))
                
            # iterate over sequences
            for seq_name,seq in seqs:
                # renumber sequences, since sequence numbers may collide
                # across multiple split-library runs
                qual_seq_name,qual_seq=qual_sequences.next()
                if seq_name==qual_seq_name:
                    
                    full_seq_name_list=seq_name.split()
                    seq_name_prefix='_'.join(full_seq_name_list[0].split('_')[:-1])
                
                    # get per sample sequence counts
                    if seq_name_prefix in samples:
                        samples[seq_name_prefix]=samples[seq_name_prefix]+1
                    else:
                        samples[seq_name_prefix]=1
                
                    # update the sequence name, but retain barcode info
                    updated_seq_name=seq_name_prefix + '_' + str(iterator) + \
                                     ' ' + ' '.join(full_seq_name_list[1:])
                
                    # write the sequence out in FASTQ format
                    output_seqs.write('@%s\n%s\n+\n%s\n' % \
                            (str(updated_seq_name), str(seq), str(qual_seq)))
                    # write the sequence out in FASTA format
                    output_fna.write('>%s\n%s\n' % (str(updated_seq_name),
                                                     str(seq)))
                    iterator=iterator+1
                else:
                    print seq_name
            
            # get list of biom files
            gg_biom_fp=join(study_input_dir,processed_folder,
                            'gg_97_otus','exact_uclust_ref_otu_table.biom')
                            
            if exists(gg_biom_fp) and getsize(gg_biom_fp)>0:
                biom_files.append(gg_biom_fp)
                
    output_seqs.close()
    output_fna.close()
    
    # zip the full split-library sequence file
    #cmd_call='cd %s; tar rzvf %s %s' % (study_input_dir,zip_fname,seq_fname)
    #cmd_call='cd %s; tar rzvf %s %s' % (study_input_dir,zip_fname,fna_fname)
    #system(cmd_call)
    
    return files_to_remove, biom_files, samples
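A minimal usage sketch for the function above; the study ID and paths are hypothetical, and the input directory is expected to contain one or more "processed*" folders, each with a split_libraries/seqs.fna and matching .qual file:

# hypothetical call; zip_fname is unused by the current body because the
# tar step at the end is commented out
files_to_remove, biom_files, samples = generate_full_split_lib_fastq(
    1001, '/studies/study_1001', 'study_1001.tgz', [], '/studies/study_1001')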
Example #3
def assign_dna_reads_to_protein_database(query_fasta_fp,
                                         database_fasta_fp,
                                         output_fp,
                                         temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to 
              /tmp.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from protein and dna, respectively.

    This method returns an open file object. The output format
    defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
    """
    if params is None:
        params = {}

    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_dna_database. Use "
            "assign_reads_to_database instead.")

    if 'genetic_code' in params:
        my_genetic_code = GeneticCodes[params['genetic_code']]
        del params['genetic_code']
    else:
        my_genetic_code = GeneticCodes[1]

    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')

    for label, sequence in MinimalFastaParser(open(query_fasta_fp)):
        seq_id = label.split()[0]

        s = DNA.makeSequence(sequence)
        translations = my_genetic_code.sixframes(s)
        frames = [1, 2, 3, -1, -2, -3]
        translations = dict(zip(frames, translations))

        for frame, translation in sorted(translations.iteritems()):
            entry = '>{seq_id}_frame_{frame}\n{trans}\n'
            entry = entry.format(seq_id=seq_id, frame=frame, trans=translation)
            tmp_out.write(entry)

    tmp_out.close()
    result = assign_reads_to_database(tmp, database_fasta_fp, output_fp, \
                                      params = my_params)

    remove(tmp)

    return result
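A usage sketch for the wrapper above; the paths and genetic code value are hypothetical, and BLAT plus the names used by the function (GeneticCodes, DNA, assign_reads_to_database, get_tmp_filename) are assumed to be importable:

# hypothetical paths; the returned value is an open file object holding
# blast9-formatted hits, per the docstring above
hits = assign_dna_reads_to_protein_database(
    '/data/reads.fna',              # DNA query sequences
    '/data/proteins.fasta',         # protein reference database
    '/data/reads_vs_proteins.bl9',  # output filepath
    temp_dir='/tmp',
    params={'genetic_code': 11})    # optional; defaults to the standard code
hits.close()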
Example #4
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(**script_info)

    parameters = {}

    # get the tree insertion method to use
    module = opts.insertion_method

    # create output directory
    output_dir = opts.output_dir
    create_dir(output_dir)

    # list of tree insertion methods
    tree_insertion_module_names = \
        {'raxml_v730': cogent.app.raxml_v730,
         'parsinsert': cogent.app.parsinsert,
         'pplacer': cogent.app.pplacer}

    # load input sequences and convert to phylip, since the tools require
    # the query sequences to have phylip-compliant names
    load_aln = MinimalFastaParser(open(opts.input_fasta_fp, 'U'))
    aln = DenseAlignment(load_aln)
    seqs, align_map = aln.toPhylip()

    if opts.method_params_fp:
        param_dict = parse_qiime_parameters(open(opts.method_params_fp, 'U'))

    if module == 'raxml_v730':
        # load the reference sequences
        load_ref_aln = \
            DenseAlignment(MinimalFastaParser(open(opts.refseq_fp, 'U')))

        # combine and load the reference plus query
        combined_aln = MinimalFastaParser(StringIO(load_ref_aln.toFasta() +
                                                   '\n' + aln.toFasta()))
        # overwrite the alignment map
        aln = DenseAlignment(combined_aln)
        seqs, align_map = aln.toPhylip()

        try:
            parameters = param_dict['raxml']
        except:
            parameters = {}

        tree = convert_tree_tips(align_map, opts.starting_tree_fp)

        # write out the tree with phylip labels
        updated_tree_fp = join(output_dir,
                               '%s_phylip_named_tree.tre' % (module))
        write_updated_tree_file(updated_tree_fp, tree)

        # set the primary parameters for raxml
        parameters['-w'] = abspath(output_dir) + '/'
        parameters["-n"] = split(splitext(get_tmp_filename())[0])[-1]
        parameters["-t"] = updated_tree_fp

        if "-f" not in parameters:
            parameters["-f"] = 'v'
        if "-m" not in parameters:
            parameters["-m"] = 'GTRGAMMA'

    elif module == 'pplacer':
        try:
            parameters = param_dict['pplacer']
        except:
            parameters = {}

        # make sure stats file is passed
        if not opts.stats_fp:
            raise IOError(
                'When using pplacer, the RAxML produced info file is required.')

        # set the primary parameters for pplacer - allow for user-defined
        parameters['--out-dir'] = abspath(output_dir) + '/'
        parameters["-t"] = opts.starting_tree_fp
        parameters['-r'] = opts.refseq_fp
        parameters['-s'] = opts.stats_fp

    elif module == 'parsinsert':
        try:
            parameters = param_dict['parsinsert']
        except:
            parameters = {}

        # define log fp
        log_fp = join(output_dir, 'parsinsert.log')

        # define tax assignment values fp
        tax_assign_fp = join(output_dir, 'parsinsert_assignments.log')
        parameters["-l"] = log_fp
        parameters["-o"] = tax_assign_fp
        parameters["-s"] = opts.refseq_fp
        parameters["-t"] = opts.starting_tree_fp

    # call the module and return a tree object
    result = \
        tree_insertion_module_names[module].insert_sequences_into_tree(seqs,
                                                                       moltype=DNA, params=parameters)

    result_tree = strip_and_rename_unwanted_labels_from_tree(align_map, result)

    # write out the resulting tree
    final_tree = join(output_dir, '%s_final_placement.tre' % (module))
    write_updated_tree_file(final_tree, result_tree)
Example #5
from sys import argv

from cogent.parse.fasta import MinimalFastaParser

if __name__ == '__main__':

    metric_lines = [
        l.strip().split('\t') for l in open(argv[1]) if not l.startswith('#')
    ]
    chi_lines = [
        l.strip().split('\t') for l in open(argv[2]) if not l.startswith('#')
    ]

    chi = set([r[0] for r in chi_lines])
    metrics = dict([(id_, (float(inv), float(nonacgt))) \
                        for id_,inv,nonacgt in metric_lines])

    seqs = MinimalFastaParser(open(argv[3]))
    output_seqs = open(argv[4], 'w')
    output_reasons = open(argv[4] + '.filtered', 'w')
    output_reasons.write("#ncbi_acc_w_ver\treason\n")

    for i, s in seqs:
        if i in chi:
            output_reasons.write("%s\tchimeric\n" % i)
            continue

        inv, nonacgt = metrics[i]

        if inv < 0.9:
            output_reasons.write(i + "\tinvariants < 90%")
            output_reasons.write("\n")
            continue
Example #6
def parse_fasta(lines):
    """lightweight parser for KEGG FASTA format sequences"""
    for label, seq in MinimalFastaParser(lines):
        yield '\t'.join(list(kegg_label_fields(label)) \
          + [seq] + ["\n"])
Example #7
def convert_fastq(fasta_file_path,
                  qual_file_path,
                  output_directory='.',
                  multiple_output_files=False,
                  ascii_increment=33,
                  full_fastq=False,
                  full_fasta_headers=False,
                  per_file_buffer_size=100000):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)
    
    fasta_file_path:  filepath of input FASTA file.
    qual_file_path:  filepath of input QUAL file (needed for making FASTQ files)
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.
    per_file_buffer_size:  Number of buffered characters per output file before
     they are flushed to disk (only used with multiple_output_files).'''

    fasta_file = open(fasta_file_path, 'U')
    qual_file = open(qual_file_path, 'U')

    # if we're not using multiple output files, we can open the one (and only)
    # output file right now
    if not multiple_output_files:
        output_file_path = get_filename_with_new_ext(fasta_file_path, '.fastq',
                                                     output_directory)

        fastq_file = open(output_file_path, 'w')
    else:
        fastq_lookup = defaultdict(str)

    # iterate through the FASTA and QUAL files entry by entry (assume the
    # entries are synchronized)
    for fasta_data, qual_data in izip(MinimalFastaParser(fasta_file),
                                      MinimalQualParser(qual_file)):

        qual_header = qual_data[0]
        fasta_header = fasta_data[0]

        label = fasta_header.split()[0]
        sample_id = label.split('_')[0]

        sequence = fasta_data[1]
        qual = qual_data[1]

        # check whether the entries are actually (at least nominally) synch'd
        if qual_header != label:
            raise KeyError, ("QUAL header (%s) does not match "
                             "FASTA header (%s)") % (qual_header, label)

        if len(sequence) != len(qual):
            raise KeyError, ("Sequence length does not match QUAL length for "
                             "label (%s)") % label

        if multiple_output_files:
            output_file_path = get_filename_with_new_ext(
                fasta_file_path, '_' + sample_id + '.fastq', output_directory)

            # when we use multiple output files, records are buffered per
            # output path and flushed in batches (see per_file_buffer_size)
            # to avoid keeping many file handles open at once

        if full_fasta_headers:
            fastq_sequence_header = fasta_header
        else:
            fastq_sequence_header = label

        if full_fastq:
            fastq_quality_header = fastq_sequence_header
        else:
            fastq_quality_header = ''

        #Writing to FASTQ file
        record = '@%s\n%s\n+%s\n' % (fastq_sequence_header, sequence,
                                     fastq_quality_header)

        if multiple_output_files:
            fastq_lookup[output_file_path] += record
        else:
            fastq_file.write(record)

        for qual_score in qual:
            # increment the qual score by the asciiIncrement (default 33),
            # and print the corresponding character, which represents that
            # position's quality.
            qual_score += ascii_increment
            if qual_score < 32 or qual_score > 126:
                raise ValueError, (
                    "Cannot convert quality score to ASCII code" +
                    " between 32 and 126: " +
                    str(qual_score - ascii_increment) +
                    " using ascii_increment = " + str(ascii_increment))

            if multiple_output_files:
                fastq_lookup[output_file_path] += chr(qual_score)
            else:
                fastq_file.write(chr(qual_score))

        if multiple_output_files:
            fastq_lookup[output_file_path] += '\n'
        else:
            fastq_file.write('\n')

        if multiple_output_files:
            if len(fastq_lookup[output_file_path]) >= per_file_buffer_size:
                fastq_file = open(output_file_path, 'a')
                fastq_file.write(fastq_lookup[output_file_path])
                fastq_lookup[output_file_path] = ''
                fastq_file.close()

    # write last seqs to output files, or close the output file if there is
    # only one
    if multiple_output_files:
        for output_file_path, records in fastq_lookup.iteritems():
            if records:
                fastq_file = open(output_file_path, 'a')
                fastq_file.write(records)
                fastq_file.close()
    else:
        fastq_file.close()
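A usage sketch for convert_fastq; the input filepaths are hypothetical and the output directory is assumed to exist:

# hypothetical inputs; with multiple_output_files=True one FASTQ file is
# written per SampleID parsed from the sequence labels
convert_fastq('seqs.fna', 'seqs.qual',
              output_directory='fastq_out',
              multiple_output_files=True,
              ascii_increment=33)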
Example #8
    def __call__(self,
                 seq_path,
                 otu_path,
                 reference_path,
                 result_path=None,
                 log_path=None,
                 sort_by='otu'):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.
        
        Parameters:
        seq_path: path to file of sequences
        otu_path: path to file of OTUs
        result_path: path to file of results. If specified,
        dumps the result to the desired path instead of returning it.
        log_path: path to log, which includes dump of params.
        sort_by: sort by otu or seq_id
        """
        # Load the seq path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        if seq_path:
            seq_f = open(seq_path, 'U')
            seqs = dict(MinimalFastaParser(seq_f, label_to_name=label_to_name))
            seq_f.close()
        else:
            # allows the user to not pass seqs, which can be useful when
            # all otus are based on reference sequences
            seqs = {}

        # Load the reference_path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        reference_f = open(reference_path, 'U')
        reference_seqs = dict(\
         MinimalFastaParser(reference_f,label_to_name=label_to_name))
        reference_f.close()

        #Load the otu file
        otu_f = open(otu_path, 'U')
        otus = fields_to_dict(otu_f)
        otu_f.close()

        if self.Params['ChoiceFRequiresSeqs']:
            choice_f = self.Params['ChoiceF'](seqs)
        else:
            choice_f = self.Params['ChoiceF']

        #actually pick the set
        result = {}
        for set_id, ids in otus.items():
            if set_id in reference_seqs:
                result[set_id] = (reference_seqs, set_id)
            elif seqs:
                result[set_id] = (seqs, choice_f(ids, seqs))
            else:
                raise KeyError,\
                 "Unknown reference sequence identifier: %s\n" % set_id +\
                 "Have you provided the correct reference sequence file? " +\
                 "Did you forget to provide a seqs filepath for de novo OTUs?"

        if result_path:
            of = open(result_path, 'w')
            if sort_by == 'seq_id':

                def key(s):
                    try:
                        return int(s[1].split('_', 1)[-1])
                    except ValueError:
                        return s
            else:
                key = lambda s: s
            for cluster, rep in sorted(result.items(), key=key):
                seq_lookup, id_ = rep
                try:
                    of.write('>%s %s\n%s\n' % (cluster, id_, seq_lookup[id_]))
                except KeyError:
                    raise KeyError,\
                     "Sequence identifiers (%s and %s) "  % (cluster, id_) +\
                     "not found in reference or sequence collection."
            of.close()
            result = None
            log_str = 'Result path: %s' % result_path
        else:
            # The return value here differs from GenericRepSetPicker
            # because it is possible for the representative sequences
            # to be ambiguous. For example, if the identifiers in
            # seq_path and reference_path are both integers, returning
            # a sequence identifier is not sufficent to determine which
            # sequence collection the reference sequence came from.
            # Therefore if the user did not provide a result_path, store
            # the result in a dict of {otu_id: (rep_id, rep_seq)},
            log_str = 'Result path: None, returned as dict.'

            for cluster, rep in result.items():
                seq_lookup, id_ = rep
                try:
                    result[cluster] = (id_, seq_lookup[id_])
                except KeyError:
                    raise KeyError,\
                     "Sequence identifiers (%s and %s) "  % (cluster, id_) +\
                     "not found in reference or sequence collection."

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, 'w')
            log_file.write(str(self))
            log_file.write('\n')
            log_file.write('%s\n' % log_str)

        # return the result (note this is None if the data was
        # written to file)
        return result
Example #9
    def __call__(self,
                 seq_path=None,
                 seqs=None,
                 result_path=None,
                 log_path=None):
        """Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.
        """
        assert seq_path or seqs, \
         "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        # assign the blast database, either as a pre-existing database
        # specified as self.Params['blast_db'] or by creating a
        # temporary database from the sequence file specified
        # as self.Params['reference_seqs_filepath']
        try:
            blast_db = self.Params['blast_db']
        except KeyError:
            # build a temporary blast_db
            reference_seqs_path = self.Params['reference_seqs_filepath']
            refseqs_dir, refseqs_name = os.path.split(reference_seqs_path)
            blast_db, db_files_to_remove = \
             build_blast_db_from_fasta_path(reference_seqs_path)

        # build the mapping of sequence identifier
        # (wrt to the blast db seqs) to taxonomy
        id_to_taxonomy_map = self._parse_id_to_taxonomy_file(\
         open(self.Params['id_to_taxonomy_filepath'],'U'))

        ## Iterate over the input self.SeqsPerBlastRun seqs at a time.
        # There are two competing issues here when dealing with very large
        # inputs. If all sequences are read in at once, the containing object
        # can be very large, causing the system to page. On the other hand,
        # in such cases it would be very slow to treat each sequence
        # individually, since blast requires a filepath. Each call would
        # therefore involve writing a single sequence to file, opening/closing
        # and removing the file. To balance this, sequences are read in and
        # blasted in chunks of self.SeqsPerBlastRun (default: 1000) at a time.
        # This appears to solve the problem with the largest sets I've worked
        # with so far.

        if seq_path:
            # Get a seq iterator
            seqs = MinimalFastaParser(open(seq_path))
        # Build objects to keep track of the current set of sequences to be
        # blasted, and the results (i.e., a seq_id -> (taxonomy, quality score)
        # mapping)
        current_seqs = []
        result = {}

        # Iterate over the (seq_id, seq) pairs
        for seq_id, seq in seqs:
            # append the current seq_id,seq to list of seqs to be blasted
            current_seqs.append((seq_id, seq))

            # When there are 1000 in the list, blast them
            if len(current_seqs) == self.SeqsPerBlastRun:
                # update the result object
                result.update(self._seqs_to_taxonomy(\
                 current_seqs,blast_db,id_to_taxonomy_map))
                # reset the list of seqs to be blasted
                current_seqs = []
        # Assign taxonomy to the remaining sequences
        result.update(self._seqs_to_taxonomy(\
         current_seqs,blast_db,id_to_taxonomy_map))
        ## End iteration over the input self.SeqsPerBlastRun seqs at a time.

        # Write log data if we have a path (while the logger can handle
        # being called if we are not logging, some of these steps are slow).
        if log_path is not None:
            num_inspected = len(result)
            logger.info('Number of sequences inspected: %s' % num_inspected)
            num_null_hits = [r[1] for r in result.values()].count(None)
            logger.info('Number with no blast hits: %s' % num_null_hits)

        if result_path:
            # if the user provided a result_path, write the
            # results to file
            of = open(result_path, 'w')
            for seq_id, (lineage, confidence, blast_hit_id) in result.items():
                of.write('%s\t%s\t%s\t%s\n' %
                         (seq_id, lineage, confidence, blast_hit_id))
            of.close()
            result = None
            logger.info('Result path: %s' % result_path)
        else:
            # if no result_path was provided, the data is returned as a dict,
            # so no modification to result is necessary
            logger.info('Result path: None, returned as dict.')

        # clean-up temp blastdb files, if a temp blastdb was created
        if 'reference_seqs_filepath' in self.Params:
            map(remove, db_files_to_remove)

        # return the result
        return result
Example #10
def denoise_per_sample(sff_fp, fasta_fp, tmpoutdir, cluster=False,
                       num_cpus=1, squeeze=True, percent_id=0.97, bail=1,
                       primer="", low_cutoff=3.75, high_cutoff=4.5,
                       log_fp="denoiser.log", low_memory=False, verbose=False,
                       error_profile=DENOISER_DATA_DIR +'FLX_error_profile.dat',
                       max_num_rounds=None, titanium=False):
    """Denoise each sample separately"""

    #abort early if binary is missing
    check_flowgram_ali_exe()

    log_fh = None    
    if log_fp:
        #switch of buffering for global log file
        log_fh = open(tmpoutdir+"/"+log_fp, "w", 0)

    # overwrite settings if titanium is set
    # This flag is only used from qiime. Remove after qiime integration
    if titanium:
        error_profile = DENOISER_DATA_DIR + "Titanium_error_profile.dat"
        low_cutoff  = 4
        high_cutoff = 5

    if verbose:
        log_fh.write("Denoiser version: %s\n" % __version__)
        log_fh.write("SFF file: %s\n" % sff_fp)
        log_fh.write("Fasta file: %s\n" % fasta_fp)
        log_fh.write("Cluster: %s\n" % cluster)
        log_fh.write("Num CPUs: %d\n" % num_cpus)
        log_fh.write("Squeeze Seqs: %s\n" % squeeze)
        log_fh.write("tmpdir: %s\n\n" % tmpoutdir)
        log_fh.write("percent_id threshold: %.2f\n" % percent_id)
        log_fh.write("Minimal sequence coverage for first phase: %d\n" % bail)            
        log_fh.write("Error profile: %s\n" % error_profile)
        log_fh.write("Maximal number of iteration: %s\n\n" % max_num_rounds)
        
    # here we go ...
    sff_files = split_sff(open(sff_fp), open(fasta_fp), tmpoutdir) 
    combined_mapping = {}
    result_centroids = []
    result_singletons_files = []
    #denoise each sample separately
    for i, sff_file in enumerate(sff_files):
        if not exists(tmpoutdir+("/%d" % i)):
            makedirs(tmpoutdir+("/%d" % i))
        out_fp= tmpoutdir+("/%d/" % i)
        denoise_seqs(sff_file, fasta_fp, out_fp, None, cluster,
                     num_cpus, squeeze, percent_id, bail, primer,
                     low_cutoff, high_cutoff, log_fp, low_memory,
                     verbose, error_profile, max_num_rounds)

        #collect partial results
        this_rounds_mapping = read_denoiser_mapping(open(out_fp+"/denoiser_mapping.txt"))
        combined_mapping.update(this_rounds_mapping)
        result_centroids.append(MinimalFastaParser(open(out_fp+"/centroids.fasta")))
        result_singletons_files.append(out_fp+"/singletons.fasta")

    #write the combined files
    store_mapping(combined_mapping, tmpoutdir, "denoiser")
    seqs = chain(*result_centroids)
    fasta_fh = open(tmpoutdir+"/denoised.fasta", "w")
    #write centroids sorted by clustersize
    write_Fasta_from_name_seq_pairs(sort_seqs_by_clustersize(seqs, combined_mapping),
                                    fasta_fh)
    for singleton_file in result_singletons_files:
        write_Fasta_from_name_seq_pairs(MinimalFastaParser(open(singleton_file,"r")),
                                       fasta_fh)
    fasta_fh.close()

    # return outdir for tests/test_denoiser
    return tmpoutdir
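A usage sketch for denoise_per_sample; the filepaths are hypothetical and the flowgram-alignment binary checked by check_flowgram_ali_exe() is assumed to be installed:

# hypothetical inputs; results (denoised.fasta, denoiser mapping, logs)
# are written under the given temporary output directory
denoise_per_sample('run1.sff.txt', 'run1_seqs.fna', '/tmp/denoise_run1',
                   num_cpus=2, verbose=True)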
Example #11
def compute_min_alignment_length(seqs_f, fraction=0.75):
    """ compute the min alignment length as n standard deviations below the mean """
    med_length = median([len(s) for _, s in MinimalFastaParser(seqs_f)])
    return int(med_length * fraction)
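For example, with the default fraction the helper above returns 75% of the median sequence length (the path below is hypothetical):

# hypothetical input; e.g. a median length of 1400 nt gives a minimum
# alignment length of 1050
min_len = compute_min_alignment_length(open('/data/rep_set.fna', 'U'))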
Example #12
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        # load candidate sequences
        seq_file = open(seq_path, 'U')
        candidate_sequences = MinimalFastaParser(seq_file)

        # load template sequences
        template_alignment = []
        template_alignment_fp = self.Params['template_filepath']
        for seq_id, seq in MinimalFastaParser(open(template_alignment_fp)):
            # replace '.' characters with '-' characters
            template_alignment.append((seq_id, seq.replace('.', '-').upper()))
        try:
            template_alignment = LoadSeqs(data=template_alignment,
                                          moltype=DNA,
                                          aligned=DenseAlignment)
        except KeyError as e:
            raise KeyError(
                'Only ACGT-. characters can be contained in template alignments.'
                + ' The offending character was: %s' % e)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        if failure_path is not None:
            fail_file = open(failure_path, 'w')
            for seq in pynast_failed:
                fail_file.write(seq.toFasta())
                fail_file.write('\n')
            fail_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            for seq in pynast_aligned:
                result_file.write(seq.toFasta())
                result_file.write('\n')
            result_file.close()
            return None
        else:
            try:
                return LoadSeqs(data=pynast_aligned, aligned=DenseAlignment)
            except ValueError:
                return {}
Example #13
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None,
                 cmbuild_params=None,
                 cmalign_params=None):

        log_params = []
        # load candidate sequences
        candidate_sequences = dict(MinimalFastaParser(open(seq_path, 'U')))

        # load template sequences
        try:
            info, template_alignment, struct = list(
                MinimalRfamParser(open(self.Params['template_filepath'], 'U'),
                                  seq_constructor=ChangedSequence))[0]
        except RecordError:
            raise ValueError(
                "Template alignment must be in Stockholm format with corresponding secondary structure annotation when using InfernalAligner."
            )

        moltype = self.Params['moltype']

        # Need to make separate mapping for unaligned sequences
        unaligned = SequenceCollection(candidate_sequences, MolType=moltype)
        int_map, int_keys = unaligned.getIntMap(prefix='unaligned_')
        int_map = SequenceCollection(int_map, MolType=moltype)

        # Turn on --gapthresh option in cmbuild to force alignment to full
        # model
        if cmbuild_params is None:
            cmbuild_params = {}
        cmbuild_params.update({'--gapthresh': 1.0})

        # record cmbuild parameters
        log_params.append('cmbuild parameters:')
        log_params.append(str(cmbuild_params))

        # Turn on --sub option in Infernal, since we know the unaligned sequences
        # are fragments.
        # Also turn on --gapthresh to use same gapthresh as was used to build
        # model

        if cmalign_params is None:
            cmalign_params = {}
        cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

        # record cmalign parameters
        log_params.append('cmalign parameters:')
        log_params.append(str(cmalign_params))

        # Align sequences to alignment including alignment gaps.
        aligned, struct_string = cmalign_from_alignment(
            aln=template_alignment,
            structure_string=struct,
            seqs=int_map,
            moltype=moltype,
            include_aln=True,
            params=cmalign_params,
            cmbuild_params=cmbuild_params)

        # Pull out original sequences from full alignment.
        infernal_aligned = {}
        aligned_dict = aligned.NamedSeqs
        for key in int_map.Names:
            infernal_aligned[int_keys.get(key, key)] = aligned_dict[key]

        # Create an Alignment object from alignment dict
        infernal_aligned = Alignment(infernal_aligned, MolType=moltype)

        if log_path is not None:
            log_file = open(log_path, 'w')
            log_file.write('\n'.join(log_params))
            log_file.close()

        if result_path is not None:
            result_file = open(result_path, 'w')
            result_file.write(infernal_aligned.toFasta())
            result_file.close()
            return None
        else:
            try:
                return infernal_aligned
            except ValueError:
                return {}
Example #14
def apply_lane_mask_and_gap_filter(fastalines, lane_mask,\
    allowed_gap_frac=1-eps, verbose=False):
    """Applies lanemask and gap filter to fasta file, yielding filtered seqs.
    """

    if lane_mask:
        # convert lane_mask to a numpy index array
        p = mask_to_positions(lane_mask)

        # special case: lanemask is all zeros
        if sum(p) == 0:
            for line in fastalines:
                if line.startswith(">"):
                    yield line + '\n'
                else:
                    yield '\n'
            return

    # random temporary file for first-pass results
    tmpfilename = "/tmp/" + "".join(sample(lowercase, 20)) + ".tmp"
    try:
        tmpfile = open(tmpfilename, 'w')
    except IOError:
        raise IOError, "Can't open temporary file for writing: %s" %\
          tmpfilename

    # the number of gaps seen in each position (length may be unknown here)
    gapcounts = None

    # First pass: apply filter, and track gaps
    if verbose: print "First pass: applying lanemask..."
    seq_count = 0
    for k, v in MinimalFastaParser(fastalines):
        seq_count += 1
        # print progress in verbose mode
        if verbose and (seq_count % 100) == 0: status(seq_count)

        # apply lanemask if there is one
        if lane_mask:
            masked = get_masked_string(v, p)
        else:
            masked = v

        # initialize gapcount array to proper length
        if gapcounts is None:
            gapcounts = zeros(len(masked))

        # increment gap counts if requested
        if allowed_gap_frac < 1:
            gapcounts[find_gaps(masked)] += 1

        # write masked sequence to temporary file
        tmpfile.write('>%s\n%s\n' % (k, masked))
    if verbose:
        print
        print
    tmpfile.close()
    tmpfile = open(tmpfilename, 'U')

    # if we're not removing gaps, we're done; yield the temp file contents
    if allowed_gap_frac == 1:
        for line in tmpfile:
            yield line

    # else we are removing gaps; do second pass
    else:

        # convert gapcounts to true/false mask
        gapcounts = (gapcounts / float(seq_count)) <= allowed_gap_frac

        # Second pass: remove all-gap positions
        if verbose: print "Second pass: remove all-gap positions..."
        seq_count = 0
        for k, v in MinimalFastaParser(tmpfile):
            seq_count += 1
            # print progress in verbose mode
            if verbose and (seq_count % 100) == 0: status(seq_count)

            masked = get_masked_string(v, gapcounts)
            yield '>%s\n' % (k)
            yield '%s\n' % (masked)
        if verbose: print

    # delete temporary file
    tmpfile.close()
    remove(tmpfilename)
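Because the function above is a generator, a caller typically streams its output straight to a filtered alignment file; the paths below are hypothetical and lane_mask is assumed to be a string of 0/1 characters matching the alignment length:

# hypothetical paths; allowed_gap_frac < 1 enables the second pass that
# drops positions that are gaps in (almost) every sequence
lane_mask = open('/data/lanemask_in_1s_and_0s.txt', 'U').read().strip()
out_f = open('/data/aligned_filtered.fasta', 'w')
for chunk in apply_lane_mask_and_gap_filter(open('/data/aligned.fasta', 'U'),
                                            lane_mask,
                                            allowed_gap_frac=0.999999):
    out_f.write(chunk)
out_f.close()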
Example #15
                        help="Number of CPUs \
        (Default 1)")

    args = parser.parse_args()
    basefolder = args.f
    if basefolder[-1] != "/":
        basefolder += "/"
    knownfile = args.i
    rnd = args.r
    if args.c < 1:
        raise ValueError("CPUs must be greater than 0!")

    knownfile = open(knownfile, 'U')
    knownseqs = []
    #build the storage vector holding known seq and output file for seq
    for header, seq in MinimalFastaParser(knownfile):
        seqfile = open(basefolder + header + ".fasta", 'w')
        seqfile.write(">%s\n%s\n" % (header, seq))
        knownseqs.append((seq, seqfile))
    knownfile.close()

    #loop over all the rounds to find sequence matches
    for currrnd in range(1, rnd + 1):
        rndname = "R" + str(currrnd)
        print rndname
        #print round info to each output file
        for info in knownseqs:
            info[1].write(">%s\n%s\n" % (rndname, rndname))
        #multiprocess each round
        manager = Manager()
        hold = manager.dict()
Example #16
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking
    
    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain 
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing 
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the 
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    HALT_EXEC:  application controller option to halt execution and print the
     command
    """
    """ 
    Need to cluster sequences de novo first to get 1. abundance information
    and 2 consensus sequence for each cluster.  Using dereplication followed
    by clustering does not appear to automatically update complete cluster 
    size, will directly cluster raw seqs with the small_mem clustering option.
    
    This means without additional parsing steps to recalculate 
    actual cluster sizes, the sizeorder option can't be used for de novo
    clustering and downstream chimera detection."""

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {
        'denovo_chimeras': 0,
        'denovo_non_chimeras': 0,
        'ref_chimeras': 0,
        'ref_non_chimeras': 0
    }

    if split_by_sampleid:
        if verbose:
            print "Splitting fasta according to SampleID..."
        full_seqs = open(input_seqs_fp, "U")
        sep_fastas =\
         split_fasta_on_sample_ids_to_files(MinimalFastaParser(full_seqs),
         output_dir)
        full_seqs.close()

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
             identify_chimeras_usearch61(curr_fasta, output_dir,
             reference_seqs_fp, suppress_usearch61_intermediates,
             suppress_usearch61_ref, suppress_usearch61_denovo,
             non_chimeras_retention, usearch61_minh, usearch61_xn,
             usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
             usearch61_abundance_skew, percent_id_usearch61, minlen,
             word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
             log_lines, verbose)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
         identify_chimeras_usearch61(input_seqs_fp, output_dir,
         reference_seqs_fp, suppress_usearch61_intermediates,
         suppress_usearch61_ref, suppress_usearch61_denovo,
         non_chimeras_retention, usearch61_minh, usearch61_xn,
         usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
         usearch61_abundance_skew, percent_id_usearch61, minlen,
         word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
         log_lines, verbose)

    # write log, non chimeras, chimeras.
    write_usearch61_log(
        log_fp, input_seqs_fp, output_dir, reference_seqs_fp,
        suppress_usearch61_intermediates, suppress_usearch61_ref,
        suppress_usearch61_denovo, split_by_sampleid, non_chimeras_retention,
        usearch61_minh, usearch61_xn, usearch61_dn, usearch61_mindiffs,
        usearch61_mindiv, usearch61_abundance_skew, percent_id_usearch61,
        minlen, word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    chimeras_f = open(chimeras_fp, "w")
    non_chimeras_f = open(non_chimeras_fp, "w")
    for curr_chimera in chimeras:
        chimeras_f.write("%s\n" % curr_chimera)
    for curr_non_chimera in non_chimeras:
        non_chimeras_f.write("%s\n" % curr_non_chimera)
    chimeras_f.close()
    non_chimeras_f.close()

    remove_files(files_to_remove)
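A usage sketch for usearch61_chimera_check; the paths are hypothetical, usearch61 must be on the PATH, and the output directory is assumed to exist:

# hypothetical inputs; writes identify_chimeric_seqs.log, chimeras.txt and
# non_chimeras.txt into the output directory
usearch61_chimera_check('seqs.fna', 'usearch61_chimera_out',
                        reference_seqs_fp='gold_reference.fasta',
                        suppress_usearch61_intermediates=True,
                        non_chimeras_retention='intersection')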
Example #17
    def __call__(self,
                 seq_path,
                 otu_path,
                 result_path=None,
                 log_path=None,
                 sort_by='otu'):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.
        
        Parameters:
        seq_path: path to file of sequences
        otu_path: path to file of OTUs
        result_path: path to file of results. If specified,
        dumps the result to the desired path instead of returning it.
        log_path: path to log, which includes dump of params.
        sort_by: sort by otu or seq_id
        """
        # Load the seq path. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        seq_f = open(seq_path, 'U')
        seqs = dict(MinimalFastaParser(seq_f, label_to_name=label_to_name))
        seq_f.close()

        #Load the otu file
        otu_f = open(otu_path, 'U')
        otus = fields_to_dict(otu_f)
        otu_f.close()

        if self.Params['ChoiceFRequiresSeqs']:
            choice_f = self.Params['ChoiceF'](seqs)
        else:
            choice_f = self.Params['ChoiceF']

        #actually pick the set
        result = {}
        for set_id, ids in otus.items():
            result[set_id] = choice_f(ids, seqs)

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            of = open(result_path, 'w')
            if sort_by == 'seq_id':

                def key(s):
                    try:
                        return int(s[1].split('_', 1)[-1])
                    except ValueError:
                        return s
            else:
                key = lambda s: s
            for cluster, id_ in sorted(result.items(), key=key):
                of.write('>%s %s\n%s\n' % (cluster, id_, seqs[id_]))
            of.close()
            result = None
            log_str = 'Result path: %s' % result_path
        else:
            # if the user did not provide a result_path, store
            # the result in a dict of {otu_id: rep_id},
            log_str = 'Result path: None, returned as dict.'

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, 'w')
            log_file.write(str(self))
            log_file.write('\n')
            log_file.write('%s\n' % log_str)

        # return the result (note this is None if the data was
        # written to file)
        return result
Example #18
def assign_seqs(file_data,
                ids_bcs_added_field,
                bc_lens,
                all_bcs,
                keep_barcode=False,
                barcode_type="golay_12",
                max_bc_errors=1.5,
                start_index=1,
                write_unassigned_reads=False,
                disable_bc_correction=False,
                added_demultiplex_field=None):
    """ Demultiplexes, writes seqs/qual files, returns log data

    file_data:  dict of open file objects, contains input fasta, qual, and
     mapping files, and output filepaths for partially demultiplexed fasta
     and qual files, and unassigned sequence output file.
    ids_bcs_added_field: dict of (barcode,added_demultiplex): SampleID
    bc_lens:  Lengths of all barcodes from largest to smallest.
    all_bcs:  List of all barcode sequences.
    keep_barcode:  If True, will not remove barcode from output files.
    barcode_type:  Specified barcode, can be golay_12, hamming_8,
     variable_length, or an integer specifying length.
    max_bc_errors:  Number of changes allowed for error correcting barcodes,
     for generic barcodes, specifies the number of mismatches allowed.
    start_index:  Specifies the first number used to enumerate output sequences.
    write_unassigned_reads:  If True, will write sequences that could not be
     demultiplexed into a separate output file.
    disable_bc_correction:  Only tests for exact matches to barcodes.
    added_demultiplex_field:  Uses data supplied in metadata mapping field
     and demultiplexes according to data in fasta labels.
    """

    log_data = initialize_log_data(ids_bcs_added_field)
    bc_freqs = defaultdict(int)

    seq_counts = 0
    enum_val = start_index
    corrected_bc_count = [0, 0]

    if file_data['qual_files']:
        for curr_fasta, curr_qual in zip(file_data['fasta_files'],
                                         file_data['qual_files']):
            for fasta_data, qual_data in izip(
                    MinimalFastaParser(curr_fasta),
                    MinimalQualParser(curr_qual, full_header=True)):

                seq_counts += 1
                fasta_label, fasta_seq = fasta_data
                qual_label, qual_seq = qual_data

                bc, corrected_bc, num_errors, added_field =\
                    get_demultiplex_data(ids_bcs_added_field,
                                         fasta_label, fasta_seq, bc_lens, all_bcs, barcode_type,
                                         max_bc_errors, disable_bc_correction, added_demultiplex_field)

                bc_freqs[bc] += 1

                sample_id, log_id, bc_corrected_result =\
                    get_output_ids(ids_bcs_added_field,
                                   corrected_bc, num_errors, added_field, max_bc_errors,
                                   enum_val)
                if bc_corrected_result == 'corrected':
                    corrected_bc_count[0] += 1
                if bc_corrected_result == 'not_corrected':
                    corrected_bc_count[1] += 1

                label_line = get_label_line(sample_id, fasta_label, bc,
                                            corrected_bc, num_errors)

                if sample_id.startswith("Unassigned") and\
                        write_unassigned_reads:
                    write_fasta_line(file_data['unassigned_seqs_f'], fasta_seq,
                                     label_line, True, len(bc))
                    write_qual_line(file_data['unassigned_qual_f'],
                                    list(qual_seq), label_line, True, len(bc))
                elif not sample_id.startswith("Unassigned"):
                    write_fasta_line(file_data['demultiplexed_seqs_f'],
                                     fasta_seq, label_line, keep_barcode,
                                     len(bc))
                    write_qual_line(file_data['demultiplexed_qual_f'],
                                    list(qual_seq), label_line, keep_barcode,
                                    len(bc))

                if log_id:
                    log_data[log_id] += 1

                enum_val += 1

    else:
        for curr_fasta in file_data['fasta_files']:
            for fasta_label, fasta_seq in MinimalFastaParser(curr_fasta):
                seq_counts += 1
                bc, corrected_bc, num_errors, added_field =\
                    get_demultiplex_data(ids_bcs_added_field,
                                         fasta_label, fasta_seq, bc_lens, all_bcs, barcode_type,
                                         max_bc_errors, disable_bc_correction, added_demultiplex_field)

                bc_freqs[bc] += 1

                sample_id, log_id, bc_corrected_result =\
                    get_output_ids(ids_bcs_added_field,
                                   corrected_bc, num_errors, added_field, max_bc_errors,
                                   enum_val)

                if bc_corrected_result == 'corrected':
                    corrected_bc_count[0] += 1
                if bc_corrected_result == 'not_corrected':
                    corrected_bc_count[1] += 1

                label_line = get_label_line(sample_id, fasta_label, bc,
                                            corrected_bc, num_errors)

                if sample_id.startswith("Unassigned") and\
                        write_unassigned_reads:
                    write_fasta_line(file_data['unassigned_seqs_f'], fasta_seq,
                                     label_line, True, len(bc))
                elif not sample_id.startswith("Unassigned"):
                    write_fasta_line(file_data['demultiplexed_seqs_f'],
                                     fasta_seq, label_line, keep_barcode,
                                     len(bc))

                if log_id:
                    log_data[log_id] += 1

                enum_val += 1

    return log_data, bc_freqs, seq_counts, corrected_bc_count
Example #19
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    if not (min_count != 0 or
            min_count_fraction != 0 or
            not isinf(max_count) or
            otu_ids_to_exclude_fp is not None or
            min_samples != 0 or not isinf(max_samples)):
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, min_count_fraction, "
            "or exclude_fp (or some combination of those).")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    if min_count_fraction > 0:
        min_count = otu_table.sum() * min_count_fraction
        print otu_table.sum(), min_count

    output_f = open(opts.output_fp, 'w')

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([
                id_.strip().split()[0] for id_, seq in MinimalFastaParser(
                    open(otu_ids_to_exclude_fp, 'U'))
            ])
        else:
            otu_ids_to_exclude = set([
                l.strip().split('\t')[0]
                for l in open(otu_ids_to_exclude_fp, 'U')
            ])

        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table, otu_ids_to_keep,
                                                    min_count, max_count,
                                                    min_samples, max_samples,
                                                    negate_ids_to_exclude)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
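
# A small worked example of the min_count_fraction handling above (numbers are
# made up for illustration): the fraction is converted to an absolute count
# against the table's total sum before filtering.
total_count = 100000            # e.g. otu_table.sum()
min_count_fraction = 0.00005
min_count = total_count * min_count_fraction
print min_count                 # 5.0 -> OTUs observed fewer than 5 times are dropped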
Example #20
def get_seqs_to_keep_lookup_from_fasta_file(fasta_f):
    """return the sequence ids within the fasta file"""
    return set([seq_id.split()[0] for seq_id,seq in MinimalFastaParser(fasta_f)])
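
# Minimal usage sketch for the function above (assumes the function and its
# MinimalFastaParser import are in scope); MinimalFastaParser accepts any
# iterable of lines, so an in-memory list works. Records are made up.
fasta_lines = ['>seq1 sample description', 'ACGTACGT', '>seq2', 'GGTTAACC']
print get_seqs_to_keep_lookup_from_fasta_file(fasta_lines)
# set(['seq1', 'seq2'])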
Example #21
 def test_empty(self):
     """MinimalFastaParser should return empty list from 'file' w/o labels"""
     self.assertEqual(list(MinimalFastaParser(self.empty)), [])
     self.assertEqual(list(MinimalFastaParser(self.nolabels, strict=False)),
                      [])
     self.assertRaises(RecordError, list, MinimalFastaParser(self.nolabels))
Example #22
    if args.c < 1:
        raise ValueError("ERROR: CPU count must be at least 1!")
    if args.sim <= 0.0 or args.sim > 1.0:
        raise ValueError("ERROR: clustering simmilarity must be > 0 and <= 1!")
    clustscore = args.csc
    outfolder = args.o.strip()
    basefolder = args.f.strip()

    if not exists(basefolder):
        raise IOError("Basefolder does not exist!")

    # calculate minseqs if necessary
    if args.minseqs == -1:
        with open(args.i) as fin:
            args.minseqs = int(
                count_seqs([h for h, s in MinimalFastaParser(fin)]) * 0.001)

    if basefolder[-1] != "/":
        basefolder += "/"
    if outfolder[-1] != "/":
        outfolder += "/"
    if not exists(outfolder):
        mkdir(outfolder)

    date = str(datetime.now())
    print "Program started ", date

    # print out run info to a file
    infofile = open(outfolder + "runparams.txt", 'w')
    infofile.write(''.join([
        "Program started ", date, "\n", "FASTA file:\t", args.i, "\n",
Example #23
def get_chimeras_from_Nast_aligned(seqs_fp,
                                   ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None,
                                   HALT_EXEC=False,
                                   min_div_ratio=None,
                                   keep_intermediates=False):
    """remove chimeras from seqs_fp using chimeraSlayer.

    seqs_fp:  a filepath with the seqs to check in the file
    ref_db_aligned_fp: fp to (pynast) aligned reference sequences
    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed on the fly if not provided,
    HALT_EXEC: stop execution if true
    min_div_ratio: passed to ChimeraSlayer App
    """

    files_to_remove = []
    # might come in as a FilePath object with surrounding quotes
    seqs_fp = str(seqs_fp).strip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    #if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    # ChimeraSlayer puts some temp files in the current dir and some in the dir
    # of the input file; use --exec_dir to change to the input file's dir so
    # that all tmp files end up in one place
    params = {'--query_NAST': new_seqs_fp, '--exec_dir': seqs_dir}

    if ref_db_aligned_fp is None and ref_db_fasta_fp is None:
        #use default db, whose relative position to the
        #ChimeraSlayer binary is hardcoded
        pass

    else:
        if not ref_db_fasta_fp:
            #make degapped reference file
            ref_db_fasta_fp = write_degapped_fasta_to_file(MinimalFastaParser( \
                    open(ref_db_aligned_fp)))
            files_to_remove.append(ref_db_fasta_fp)
        #use user db
        params.update({
            '--db_NAST': ref_db_aligned_fp,
            '--db_FASTA': ref_db_fasta_fp
        })

    if min_div_ratio is not None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    #    this is a FilePath object in case of success.
    #    How can we test for failure here?
    #    if not exists(app_results['CPS']):
    #         raise ApplicationError, "ChimeraSlayer failed. No output file."

    chimeras = parse_CPS_file((app_results['CPS']))
    if not keep_intermediates:
        app.remove_intermediate_files()
        remove_files(files_to_remove)

    return chimeras
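
# Illustrative call of the function above; the file paths are placeholders and
# the ChimeraSlayer binary plus QIIME's wrapper must be installed for this to
# actually run.
chimeric_seqs = get_chimeras_from_Nast_aligned(
    'pynast_aligned_seqs.fasta',
    ref_db_aligned_fp='core_set_aligned.fasta',
    keep_intermediates=False)
print chimeric_seqs  # parsed contents of ChimeraSlayer's CPS output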
Example #24
 def setUp(self):
     self.seqs = Alignment(dict(MinimalFastaParser(test_seqs.split())))
Example #25
def preprocess(sff_fps, log_fh, fasta_fp=None, out_fp="/tmp/",
               verbose=False, squeeze=False,
               primer=STANDARD_BACTERIAL_PRIMER):
    """Quality filtering and truncation of flowgrams, followed by denoiser phase I.

    sff_fps: List of paths to flowgram files

    log_fh: log messages are written to log_fh if it is set to something other than None

    fasta_fp: Path to fasta file, formatted as from split_libraries.py.
              This file is used to filter the flowgrams in sff_fps. Only reads in
              fasta_fp are pulled from sff_fps.

    out_fp: path to output directory

    verbose: a binary verbose flag

    squeeze: a flag that controls if sequences are squeezed before phase I.
             Squeezing means consecutive identical nucs are collapsed to one.

    primer: The primer sequence of the amplification process. This sequence
            will be removed from all reads during preprocessing
    """
    flowgrams, header = cat_sff_files(map(open, sff_fps))

    if(fasta_fp):
        # remove barcodes and sequences tossed by split_libraries, i.e. not in
        # fasta_fp
        labels = imap(lambda a_b: a_b[0], MinimalFastaParser(open(fasta_fp)))
        barcode_mapping = extract_barcodes_from_mapping(labels)
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp,
                                                      barcode_mapping=barcode_mapping,
                                                      primer=primer)
        if verbose:
            log_fh.write(
                "Sequences in barcode mapping: %d\n" %
                len(barcode_mapping))
            log_fh.write("Truncated flowgrams written: %d\n" % l)
    else:
        # just do a simple clean and truncate
        (clean_sff_fp, l) = cleanup_sff(flowgrams, header, outdir=out_fp)
        if verbose:
            log_fh.write("Cleaned flowgrams written: %d\n" % l)
        flowgrams, header = lazy_parse_sff_handle(open(clean_sff_fp))
        (trunc_sff_fp, l) = truncate_flowgrams_in_SFF(flowgrams, header,
                                                      outdir=out_fp, primer=primer)
        if verbose:
            log_fh.write("Truncated flowgrams written: %d\n" % l)
        remove(clean_sff_fp)

    if (l == 0):
        raise ValueError("No flowgrams left after preprocesing.\n" +
                         "Check your primer sequence")

    # Phase I - cluster seqs which are exact prefixes
    if verbose:
        log_fh.write("Filter flowgrams by prefix matching\n")

    (flowgrams, header) = lazy_parse_sff_handle(open(trunc_sff_fp))
    l, orig_l, mapping =\
        prefix_filter_flowgrams(flowgrams, squeeze=squeeze)

    # min_coverage=1 uses the cluster centroid instead of an averaged flowgram;
    # averaged flowgrams are "too good" and make the greedy clustering
    # cluster too much.
    averaged_sff_fp, seqs = build_averaged_flowgrams(
        mapping, trunc_sff_fp, min_coverage=1,
        out_fp=out_fp + "/prefix_dereplicated.sff.txt")
    remove(trunc_sff_fp)
    if verbose:
        log_fh.write("Prefix matching: removed %d out of %d seqs\n"
                     % (orig_l - l, orig_l))
        log_fh.write("Remaining number of sequences: %d\n" % l)
        log_fh.write(make_stats(mapping) + "\n")

    # print representative sequences and mapping
    print_rep_seqs(mapping, seqs, out_fp)
    store_mapping(mapping, out_fp, "prefix")
    return (averaged_sff_fp, l, mapping, seqs)
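
# The "squeeze" idea from the docstring above in miniature (illustrative only;
# the denoiser uses its own implementation): collapse runs of identical bases
# into a single base.
from itertools import groupby

def squeeze_sequence(seq):
    # keep one representative per run of identical characters
    return ''.join([base for base, _run in groupby(seq)])

print squeeze_sequence('AAATTGGGC')  # ATGC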
Example #26
def assign_taxonomy(data,
                    min_confidence=0.80,
                    output_fp=None,
                    training_data_fp=None,
                    max_memory=None):
    """ Assign taxonomy to each sequence in data with the RDP classifier 
    
        data: open fasta file object or list of fasta lines
        confidence: minimum support threshold to assign taxonomy to a sequence
        output_fp: path to write output; if not provided, result will be 
         returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
    
    """
    data = list(data)

    # build a map of seq identifiers as the RDP classifier doesn't
    # preserve these perfectly
    identifier_lookup = {}
    for seq_id, seq in MinimalFastaParser(data):
        identifier_lookup[seq_id.split()[0]] = seq_id

    # build the classifier object
    app = RdpClassifier20()
    if max_memory is not None:
        app.Parameters['-Xmx'].on(max_memory)
    if training_data_fp is not None:
        app.Parameters['-training-data'].on(training_data_fp)

    # apply the rdp app controller
    rdp_result = app('\n'.join(data))
    # grab assignment output
    result_lines = rdp_result['Assignments']

    # start a list to store the assignments
    results = {}

    # ShortSequenceException messages are written to stdout
    # Tag these ID's as unassignable
    stdout_lines = rdp_result['StdOut']
    for line in stdout_lines:
        if line.startswith('ShortSequenceException'):
            matchobj = re.search('recordID=(\S+)', line)
            if matchobj:
                rdp_id = matchobj.group(1)
                orig_id = identifier_lookup[rdp_id]
                results[orig_id] = ('Unassignable', 1.0)

    # iterate over the identifier, assignment strings (this is a bit
    # of an abuse of the MinimalFastaParser, as these are not truly
    # fasta lines)
    for identifier, assignment_str in MinimalFastaParser(result_lines):
        # get the original identifier from the one in the rdp result
        identifier = identifier_lookup[\
         identifier[:identifier.index('reverse=')].strip()]
        # build a list to store the assignments we're confident in
        # (i.e., the ones that have a confidence greater than min_confidence)
        confident_assignments = []
        # keep track of the lowest acceptable confidence value that
        # has been encountered
        lowest_confidence = 0.0

        # split the taxonomy assignment string
        assignment_fields = assignment_str.split(';')
        # iterate over (assignment, assignment confidence) pairs
        for i in range(0, len(assignment_fields), 2):
            assignment = assignment_fields[i]
            try:
                assignment_confidence = float(assignment_fields[i + 1])
            except IndexError:
                break
            # check the confidence of the current assignment
            if assignment_confidence >= min_confidence:
                # if the current assignment confidence is greater than
                # the min, store the assignment and confidence value
                confident_assignments.append(assignment.strip())
                lowest_confidence = assignment_confidence
            else:
                # otherwise, we've made it to the lowest assignment that
                # met the confidence threshold, so bail out of the loop
                break

        # store the identifier, the semi-colon-separated assignments, and the
        # confidence for the last assignment
        results[identifier] = \
             (';'.join(confident_assignments),lowest_confidence)

    if output_fp:
        try:
            output_file = open(output_fp, 'w')
        except IOError:
            raise IOError, "Can't open output file for writing: %s" % output_fp

        for seq_id, values in results.items():
            output_file.write('%s\t%s\t%1.3f\n' %
                              (seq_id, values[0], values[1]))

        output_file.close()
        return None
    else:
        return results
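
# A worked example of the confidence-pair parsing above, using a made-up RDP
# assignment string (alternating taxon and confidence, semicolon-separated).
assignment_str = 'Bacteria;1.0;Firmicutes;0.92;Clostridia;0.61'
min_confidence = 0.80
fields = assignment_str.split(';')
confident_assignments = []
lowest_confidence = 0.0
for i in range(0, len(fields), 2):
    try:
        conf = float(fields[i + 1])
    except IndexError:
        break
    if conf >= min_confidence:
        confident_assignments.append(fields[i].strip())
        lowest_confidence = conf
    else:
        # first assignment below the threshold ends the walk down the lineage
        break
print ';'.join(confident_assignments), lowest_confidence
# Bacteria;Firmicutes 0.92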
Example #27
from sys import argv
from cogent.parse.fasta import MinimalFastaParser

silva_taxa = open(argv[1], "U")

id_to_taxa = {}

for line in silva_taxa:
    curr_id = line.split()[0].strip()
    curr_taxa = " ".join(line.split()[1:]).strip()
    id_to_taxa[curr_id] = curr_taxa

rep_set_fasta = open(argv[2], "U")

ordered_ids = []
for label,seq in MinimalFastaParser(rep_set_fasta):
    ordered_ids.append(label)

ordered_ids = set(ordered_ids)

otu_mapping = open(argv[3], "U")

matched_ids = {}

for line in otu_mapping:
    if len(line.strip()) == 0:
        continue
    curr_line = line.strip().split('\t')
    curr_otu = curr_line[0]
    all_seqs = curr_line[1:]
    for seq in all_seqs:
Example #28
def parse_and_submit_params(key,project_id,seq_file,output_dir,\
                            submit_to_server=True):
    '''This function takes the input options from the user and generates a URL
       and request header for submitting to the MG-RAST CGI script'''

    # Verify that the user's computer can connect to the internet
    try:
        check_internet=urlopen('http://www.google.com')
    except:
        raise OSError, "This script is having trouble connecting to the internet!"

    #parse and split fasta file into individual sample fastas
    fasta_file=MinimalFastaParser(open(seq_file))
    split_fasta_on_sample_ids_to_files(fasta_file,output_dir)
    
    #set the MG-RAST link for QIIME
    host = 'metagenomics.anl.gov'

    #open the log html
    log_file=open(os.path.join(output_dir,'log.html'),'w')
    log_data=['<h3>The following jobs were submitted to MG-RAST.</h3>']
    log_data.append('<table border=1><tr><th>Fasta File</th><th>Job ID</th>')
    log_data.append('<th>md5</th></tr>')
    num=0
    #iterate over the fasta files in the given directory
    fasta_filepaths = glob('%s/*.fasta' % output_dir)
    fasta_filepaths.sort()
    for i in fasta_filepaths:
        
        #Get the sample id from the fasta filename
        sample_id=os.path.split(os.path.splitext(i)[0])[-1]
        
        #set the parameters
        params=[('key', key), ('sample', sample_id), ('project', project_id)]
        
        #get the full path and short name for the fasta file to be uploaded
        file_to_submit=os.path.abspath(i)
        fasta_shortname=os.path.split(file_to_submit)[-1]
        
        #open and read file to be put in post form
        file_object=open(file_to_submit).read()
        
        #set the file
        files=[('file',fasta_shortname,file_object)]
        
        #Post the file and parameters
        response = post_multipart(host,params,files,submit_to_server)
        
        #check the response for MG-RAST errors
        job=re.findall(r'<id>.*</id>',response)
        md5=re.findall(r'<md5>.*</md5>',response)
        
        #if job successful write to log html otherwise post an error message 
        #in the log file
        if job and md5:
            job_id=job[0].strip('<id>').strip('</id>')
            md5_id=md5[0].strip('<md5>').strip('</md5>')
            log_data.append('<tr><td>%s</td><td>%s</td><td>%s</td></tr>' % \
                                    (fasta_shortname,job_id,md5_id))
        else:
            response_error=re.findall(r'Can\'t call method "login" ',response)
            if response_error:
                log_data.append('</table><br><h3 style="color:red">')
                log_data.append('Web-service authorization key is not valid!')
                log_data.append('</h3>')
            else:
                log_data.append('</table><br><h3 style="color:red">%s</h3>' % \
                                    (response))
    
    log_data.append('</table>')
    
    log_info='\n'.join(log_data)
    #write and close the log html
    log_file.write(log_html % (log_info))
    log_file.close()

    return log_info
Example #29
def check_fasta_seqs(input_fasta_fp,
                     barcodes,
                     linkerprimerseqs,
                     total_seq_count,
                     valid_chars=frozenset(
                         ['A', 'T', 'C', 'G', 'N', 'a', 't', 'c', 'g', 'n'])):
    """ Returns perc of seqs w/ invalid chars, barcodes, or primers present
    
    input_fasta_fp:  fasta filepath
    barcodes: set of barcodes from the mapping file
    linkerprimerseqs: set of linkerprimersequences from the mapping file
    total_seq_count: total number of sequences in fasta file
    valid_chars: Currently allowed DNA chars
    """

    input_fasta_f = open(input_fasta_fp, "U")

    invalid_chars_count = 0
    barcodes_count = 0
    linkerprimers_count = 0
    barcodes_at_start = 0

    # Get max barcode length for checking the beginning of each seq for a barcode
    max_bc_len = max([len(bc) for bc in barcodes])

    for label, seq in MinimalFastaParser(input_fasta_f):

        # Only count one offending problem
        for curr_nt in seq:
            if curr_nt not in valid_chars:
                invalid_chars_count += 1
                break

        sliced_seq = seq[0:max_bc_len]

        for curr_bc in barcodes:
            if curr_bc in sliced_seq:
                barcodes_at_start += 1
                break

        for curr_bc in barcodes:
            if curr_bc in seq:
                barcodes_count += 1
                break

        for curr_primer in linkerprimerseqs:
            if curr_primer in seq:
                linkerprimers_count += 1
                break

    invalid_chars_count = float(invalid_chars_count)
    barcodes_count = float(barcodes_count)
    linkerprimers_count = float(linkerprimers_count)
    total_seq_count = float(total_seq_count)
    barcodes_at_start_count = float(barcodes_at_start)

    perc_invalid_chars = "%1.3f" %\
     (invalid_chars_count/total_seq_count)
    perc_barcodes_detected = "%1.3f" %\
     (barcodes_count/total_seq_count)
    perc_primers_detected = "%1.3f" %\
     (linkerprimers_count/total_seq_count)
    perc_barcodes_at_start_detected = "%1.3f" %\
     (barcodes_at_start_count/total_seq_count)

    return perc_invalid_chars, perc_barcodes_detected, perc_primers_detected,\
     perc_barcodes_at_start_detected
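
# Minimal, self-contained call of check_fasta_seqs (assumes the function above
# is in scope; the sequences, barcode, and primer are made up for illustration).
import os
import tempfile

tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.fna', delete=False)
tmp.write('>s1_0\nAGCTNACGT\n>s2_1\nACGTACGT\n')
tmp.close()

result = check_fasta_seqs(tmp.name,
                          barcodes=set(['AGCT']),
                          linkerprimerseqs=set(['ACGT']),
                          total_seq_count=2)
print result  # ('0.000', '0.500', '1.000', '0.500')
os.remove(tmp.name)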
Example #30
        result['SS'] = ResultPath(Path=self.WorkingDir+'alirna.ps',\
            IsWritten=True)

        return result


def rnaalifold_from_alignment(aln, moltype=RNA, params=None):
    """Returns seq, pairs, folding energy for alignment.
    """
    #Create Alignment object.  Object will handle if seqs are unaligned.
    aln = Alignment(aln, MolType=RNA)
    int_map, int_keys = aln.getIntMap()

    app = RNAalifold(WorkingDir='/tmp',\
        InputHandler='_input_as_multiline_string',params=params)
    res = app(clustal_from_alignment(int_map))

    #seq,pairs,energy = rnaalifold_parser(res['StdOut'].readlines())
    pairs_list = MinimalRnaalifoldParser(res['StdOut'].readlines())

    res.cleanUp()
    return pairs_list


if __name__ == "__main__":
    from sys import argv
    aln_file = argv[1]
    aln = dict(MinimalFastaParser(open(aln_file, 'U')))
    res = rnaalifold_from_alignment(aln)
    print res
Example #31
def convert_fastq(fasta_file_path,
                  qual_file_path,
                  output_directory='.',
                  multiple_output_files=False,
                  ascii_increment=33,
                  full_fastq=False,
                  full_fasta_headers=False):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)
    
    fasta_file_path:  filepath of input FASTA file.
    qual_file_path:  filepath of input QUAL file (needed for making FASTQ files)
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.'''

    output_files = {}

    fasta_file = open(fasta_file_path, 'U')
    qual_file = open(qual_file_path, 'U')

    # Need to open file the first time as "w", thereafter open as "a"
    sample_ids_written = {}

    for fasta_data, qual_data in izip(MinimalFastaParser(fasta_file),
                                      MinimalQualParser(qual_file)):

        qual_header = qual_data[0]
        fasta_header = fasta_data[0]
        label = fasta_header.split()[0]
        sample_id = label.split('_')[0]
        sequence = fasta_data[1]
        qual = qual_data[1]
        # any fasta/qual mismatch is caught by the header and length checks below

        if qual_header != label:
            raise KeyError,("Fasta(%s) and qual(%s) headers don't match" %\
            (label, qual_header))

        if len(qual) != len(sequence):
            raise KeyError,("Number of quality scores "+\
            "(%d) does not match number of positions (%d) for label: %s" %\
             (len(qual), len(sequence), label))

        if not multiple_output_files:
            output_file_path = path.join(output_directory, \
            path.splitext(path.split(fasta_file_path)[1])[0] + '.fastq')
            if output_file_path in sample_ids_written.keys():
                sample_ids_written[output_file_path] = True
            else:
                sample_ids_written[output_file_path] = False
            try:
                # Create new file if first time writing, else append
                if sample_ids_written[output_file_path]:
                    fastq_file = open(output_file_path, 'a')
                else:
                    fastq_file = open(output_file_path, 'w')
            except IOError:
                qual_file.close()
                fasta_file.close()
                raise IOError,("Could not open FASTQ file for writing: " \
                        + output_file_path + '\n')
            output_files[sample_id] = output_file_path

        if multiple_output_files:
            if sample_id not in output_files:
                output_file_path = path.join(output_directory, \
                        path.splitext(path.split(fasta_file_path)[1])[0] + \
                        '_' + sample_id + '.fastq')
                if output_file_path in sample_ids_written.keys():
                    sample_ids_written[output_file_path] = True
                else:
                    sample_ids_written[output_file_path] = False
                try:
                    # Create new file if first time writing, else append
                    if sample_ids_written[output_file_path]:
                        output_files[sample_id] = open(output_file_path, 'a')
                    else:
                        output_files[sample_id] = open(output_file_path, 'w')

                except IOError:
                    raise IOError,("Could not open FASTQ file for writing: " \
                            + output_file_path + '\n')
                output_files[sample_id] = output_file_path

        fastq_file = open(output_files[sample_id], 'a')

        if full_fasta_headers:
            fastq_sequence_header = fasta_header
        else:
            fastq_sequence_header = label

        if full_fastq:
            fastq_quality_header = fastq_sequence_header
        else:
            fastq_quality_header = ''

        #Writing to FASTQ file
        fastq_file.write('@' + fastq_sequence_header + '\n')
        fastq_file.write(sequence + '\n')
        fastq_file.write('+' + fastq_quality_header + '\n')
        qual_scores = list(qual)
        for qual_score in qual_scores:
            # increment the qual score by the asciiIncrement (default 33),
            # and print the corresponding character, which represents that
            # position's quality.
            qual_score += ascii_increment
            if qual_score < 32 or qual_score > 126:
                raise ValueError,("Cannot convert quality score to ASCII code"+\
                 " between 32 and 126: " + str(qual_score - ascii_increment) +\
                 "using ascii_increment = " + str(ascii_increment))
            fastq_file.write(chr(qual_score))
        fastq_file.write('\n')
        if multiple_output_files:
            fastq_file.close()
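
# The Phred-to-ASCII step above in isolation: add ascii_increment (33 for
# standard Sanger/Illumina 1.8+ FASTQ) to each score and emit the character.
ascii_increment = 33
qual_scores = [40, 38, 2]
print ''.join([chr(q + ascii_increment) for q in qual_scores])  # IG#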
Example #32
def process_silva(seqs, tax_out, seq_out):
    for label,seq in MinimalFastaParser(seqs):
        new_header,taxonomy = parse_label(label)
        fixed_seq = parse_seq(seq)
        tax_out.write(new_header + '\t' + taxonomy + '\n')
        seq_out.write('>' + new_header + '\n' + fixed_seq + '\n')