Exemplo n.º 1
0
 def test_parse_qual_scores(self):
     """parse_qual_scores merges several qual files into one {id: scores} dict."""
     # Two in-memory qual files; the scores of record 'x'/'a' span two lines.
     first_qual = StringIO('>x\n5 10 5\n12\n>y\n30 40')
     second_qual = StringIO('>a\n5 10 5\n12\n>b\n30 40')

     observed = parse_qual_scores([first_qual, second_qual])

     # Scores split across lines are expected to be joined per record id.
     expected = dict(
         x=[5, 10, 5, 12],
         y=[30, 40],
         a=[5, 10, 5, 12],
         b=[30, 40],
     )
     self.assertEqual(observed, expected)
Exemplo n.º 2
0
def main():
    """Convert a FASTA file plus its qual file(s) into FASTQ output.

    Reads the command-line options described by ``script_info``, loads the
    quality scores from every path in ``opts.qual_fps`` and writes FASTQ via
    ``make_fastq_multi`` (when ``opts.split`` is set) or ``make_fastq_single``.
    When no result path is given, it defaults to the input FASTA path with
    ``.fastq`` appended.

    All input handles are closed before returning, including when parsing or
    writing raises.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    in_fasta = open(opts.input_fasta_fp, 'U')
    try:
        qual_handles = []
        try:
            # Open one at a time so handles opened before a failure still
            # get closed by the finally clause below.
            for qual_fp in opts.qual_fps:
                qual_handles.append(open(qual_fp, 'U'))
            # NOTE(review): assumes parse_qual_scores returns a fully built
            # mapping (no lazy reads), so the handles can be closed right
            # after this call - confirm against its implementation.
            quals = parse_qual_scores(qual_handles)
        finally:
            for handle in qual_handles:
                handle.close()

        if not opts.result_fp:
            opts.result_fp = opts.input_fasta_fp + '.fastq'

        if opts.split:
            make_fastq_multi(in_fasta, quals, opts.result_fp)
        else:
            make_fastq_single(in_fasta, quals, opts.result_fp)
    finally:
        in_fasta.close()
Exemplo n.º 3
0
def main():
    """Convert a FASTA file plus its qual file(s) into FASTQ output.

    Reads the command-line options described by ``script_info``, loads the
    quality scores from every path in ``opts.qual_fps`` and writes FASTQ via
    ``make_fastq_multi`` (when ``opts.split`` is set) or ``make_fastq_single``.
    When no result path is given, it defaults to the input FASTA path with
    ``.fastq`` appended.

    All input handles are closed before returning, including when parsing or
    writing raises.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    in_fasta = open(opts.input_fasta_fp, 'U')
    try:
        qual_handles = []
        try:
            # Open one at a time so handles opened before a failure still
            # get closed by the finally clause below.
            for qual_fp in opts.qual_fps:
                qual_handles.append(open(qual_fp, 'U'))
            # NOTE(review): assumes parse_qual_scores returns a fully built
            # mapping (no lazy reads), so the handles can be closed right
            # after this call - confirm against its implementation.
            quals = parse_qual_scores(qual_handles)
        finally:
            for handle in qual_handles:
                handle.close()

        if not opts.result_fp:
            opts.result_fp = opts.input_fasta_fp + '.fastq'

        if opts.split:
            make_fastq_multi(in_fasta, quals, opts.result_fp)
        else:
            make_fastq_single(in_fasta, quals, opts.result_fp)
    finally:
        in_fasta.close()
Exemplo n.º 4
0
 def test_parse_qual_scores(self):
     """Scores from every qual file end up in a single {id: [scores]} dict."""
     qual_texts = ('>x\n5 10 5\n12\n>y\n30 40', '>a\n5 10 5\n12\n>b\n30 40')
     qual_handles = [StringIO(text) for text in qual_texts]

     parsed = parse_qual_scores(qual_handles)

     # Records 'x' and 'a' have scores on two lines; they must be concatenated.
     long_scores = [5, 10, 5, 12]
     short_scores = [30, 40]
     self.assertEqual(parsed, {
         'x': long_scores,
         'y': short_scores,
         'a': long_scores,
         'b': short_scores,
     })
Exemplo n.º 5
0
def preprocess(fasta_files, qual_files, mapping_file,
    barcode_type="golay_12",
    min_seq_len=200, max_seq_len=1000, min_qual_score=25, starting_ix=1,
    keep_primer=True, max_ambig=0, max_primer_mm=1, trim_seq_len=True,
    dir_prefix='.', max_bc_errors=2, max_homopolymer=4, remove_unassigned=False,
    keep_barcode=False, attempt_bc_correction=True, qual_score_window=0,
    disable_primers=False, reverse_primers='disable', record_qual_scores=False):
    """
    Preprocess barcoded libraries, e.g. from 454.

    Parameters:

    fasta_files: list of raw 454 fasta files, fasta format.

    qual_files: list of raw 454 qual file(s)

    mapping_file: mapping file with BarcodeSequence column containing valid
    barcodes used in the 454 run

    barcode_type: type of barcode, e.g. golay_12. Should appear in list of
    known barcode types, or be an integer giving the barcode length.

    min_seq_len: minimum sequence length to allow.

    max_seq_len: maximum sequence length to allow (must be >= 10).

    min_qual_score: minimum average qual score considered acceptable
    (must be >= 5).

    starting_ix: integer to start sample sequence numbering at.

    keep_primer: when True, will keep primer sequence, otherwise will strip it

    keep_barcode: when True, will keep barcode sequence, otherwise will strip it

    max_ambig: maximum number of ambiguous bases to allow in the read.

    max_primer_mm: maximum number of primer mismatches to allow.

    trim_seq_len: if True (default), calculates lengths after trimming the
    barcode and the longest primer found in the mapping file.

    dir_prefix: prefix of directories to write files into.

    max_bc_errors: maximum number of barcode errors to allow in output seqs

    max_homopolymer: maximum number of a nucleotide that can be
    repeated in a given sequence.

    remove_unassigned: If True (False default), will not write seqs to the
    output .fna file that have a valid barcode (by Golay or Hamming standard)
    but are not included in the input mapping file.

    attempt_bc_correction: (default True) will attempt to find nearest valid
    barcode.  Can be disabled to improve performance.

    qual_score_window: (default 0, i.e. disabled) when non-zero, adds a
    filter that passes the quality scores, this window value and
    min_qual_score to check_window_qual_scores and rejects reads for which
    that check fails.

    disable_primers: (default False) Disables testing for primers in the
    input mapping file and primer testing in the input sequence files.

    reverse_primers: (default 'disable') Enables removal of reverse primers and
    any subsequent sequence data from output reads.  Reverse primers have to
    be in 5'->3' format and in correct IUPAC codes in a column "ReversePrimer"
    in the input mapping file.  Run check_id_map to test primers in this
    column for valid formatting.  The primers read from this column will be
    reverse complemented and associated with the given barcode in the
    mapping file.  If set to 'truncate_only', sequences where primers are found
    will be truncated, sequences where the primer is not found will be written
    unchanged.  If set to 'truncate_remove', sequences where primers are found
    will be truncated, sequences where the primer is not found will not be
    written and counted in the log file as failing for this reason.  The
    mismatches allowed for a reverse primer match are the same as specified
    for the forward primer mismatches (max_primer_mm).

    record_qual_scores:  (default False) Will record quality scores for all
    sequences that are written to the output seqs.fna file in a separate
    file (seqs_filtered.qual) containing the same sequence IDs and
    quality scores for all bases found in the seqs.fna file.

    Result:
    in dir_prefix, writes the following files:
    seqs.fna: the sequences written by check_seqs, in fasta format
    seqs_filtered.qual: matching quality scores (only if record_qual_scores)
    split_library_log.txt: the log lines returned by check_seqs
    histograms.txt: histograms of unfiltered and filtered sequence lengths

    Raises:
    ValueError if a parameter is out of range, the mapping file is invalid,
    the barcode type/length does not match the mapping file, or the number of
    ids in the fasta and qual files differs.
    """
    # ---- Parameter validation (runs before anything touches the disk) ----
    if max_seq_len < 10:
        raise ValueError("Max sequence must be >= 10")
    if min_seq_len >= max_seq_len:
        raise ValueError("Min len cannot be >= max len")
    # Kept ahead of the ">= 5" check below so negative scores still produce
    # the message callers have always seen.
    if min_qual_score < 0:
        raise ValueError("Min qual score must be > 0")
    if starting_ix < 1:
        raise ValueError("Starting index must be > 0.")
    if max_ambig < 0:
        raise ValueError("Max ambig chars must be >= 0.")
    if max_primer_mm < 0:
        raise ValueError("Max primer mismatches must be >= 0.")
    if min_qual_score < 5:
        raise ValueError("Min qual score must be >= 5.")
    if reverse_primers not in ['disable', 'truncate_only', 'truncate_remove']:
        raise ValueError("reverse_primers parameter must be 'disable', "
                         "truncate_only, or truncate_remove.")

    create_dir(dir_prefix, fail_on_exist=False)

    # ---- Mapping file: barcodes, primers and (optionally) reverse primers ----
    map_file = open(mapping_file, 'U')
    try:
        headers, id_map, valid_map, warnings, errors, \
            primer_seqs_lens, all_primers = check_map(
                map_file, disable_primer_check=disable_primers)

        if reverse_primers != 'disable':
            if 'ReversePrimer' not in headers:
                raise ValueError(
                    'To enable reverse primer check, there must '
                    'be a "ReversePrimer" column in the mapping file with a reverse '
                    'primer in each cell.')
            rev_primers = get_reverse_primers(id_map)
        else:
            rev_primers = False
    finally:
        # Close even when check_map or the ReversePrimer check raises.
        map_file.close()

    if errors:
        raise ValueError(
            "Invalid mapping file. "
            "Validate with check_id_map first: %s" % "\n".join(errors))

    # ---- Barcode type / length consistency ----
    # Distinct barcode lengths actually present in the mapping file.
    barcode_length_check = list(set([len(bc) for bc in valid_map]))
    if barcode_type not in BARCODE_TYPES:
        # Not a named type: accept a plain integer barcode length.
        try:
            barcode_len = int(barcode_type)
        except ValueError:
            raise ValueError("Unsupported barcode type: %s" % barcode_type)
    else:
        # The second item of the entry is not needed in this function.
        barcode_len, _ = BARCODE_TYPES[barcode_type]

    # As people often do not specify a barcode that matches the lengths
    # of the barcodes used, a check on the actual barcode lengths needs to
    # be done, and an exception raised if they are variable length and not
    # specified as so.
    if barcode_type != "variable_length":
        if len(barcode_length_check) != 1:
            raise ValueError(
                'Mapping file has variable length '
                'barcodes.  If this is intended, specifiy variable lengths '
                'with the -b variable_length option.')
        if barcode_len != barcode_length_check[0]:
            raise ValueError(
                'Barcode length detected in the mapping file, '
                ' %d does not match specified barcode length, %d.  '
                'To specify a barcode '
                'length use -b golay_12 or -b hamming_8 for 12 and 8 base pair '
                'golay or hamming codes respectively, or -b # where # is the '
                'length of the barcode used.  E.g. -b 4 for 4 base pair barcodes.'
                % (barcode_length_check[0], barcode_len))

    # ---- Input sequence / quality files ----
    # Lists (not lazy map objects) because both are iterated several times.
    fasta_files = [get_infile(fasta_file) for fasta_file in fasta_files]
    qual_files = [get_infile(qual_file) for qual_file in qual_files]

    # Check fasta files valid format, no duplicate ids
    # and ids match between fasta and qual files
    all_fasta_ids = fasta_ids(fasta_files)
    all_qual_ids = fasta_ids(qual_files)
    if qual_files and (len(all_fasta_ids) != len(all_qual_ids)):
        f_ids = all_fasta_ids.difference(all_qual_ids)
        q_ids = all_qual_ids.difference(all_fasta_ids)
        raise ValueError("Found %d ids in fasta file not in qual file, %d ids in qual file not in fasta" % (len(f_ids), len(q_ids)))

    # fasta_ids() consumed the handles; rewind before the real pass.
    for f in fasta_files:
        f.seek(0)
    if qual_files:
        for q in qual_files:
            q.seek(0)
        # Load quality scores
        qual_mappings = parse_qual_scores(qual_files)
        for q in qual_files:
            q.close()
    else:
        qual_mappings = {}

    # ---- Build the per-read filters ----
    filters = []
    # seq len filter depends on whether we're including the barcode
    if trim_seq_len:
        # This processing occurs before primer testing, will use largest
        # primer length to calculate lengths.  the dict all_primers has
        # keys of each primer with the length of said primer as the value
        if not disable_primers:
            primer_seq_len = max(all_primers.values())
        else:
            # Set to zero if primers not used
            primer_seq_len = 0
        trim = barcode_len + primer_seq_len
        filters.append(SeqQualBad(
            'Length outside bounds of %s and %s' % (min_seq_len, max_seq_len),
            lambda id_, seq, qual:
                not (min_seq_len <= len(seq) - trim <= max_seq_len)))
    else:
        filters.append(SeqQualBad(
            'Length outside bounds of %s and %s' % (min_seq_len, max_seq_len),
            lambda id_, seq, qual:
                not (min_seq_len <= len(seq) <= max_seq_len)))
    filters.append(SeqQualBad(
        'Num ambiguous bases exceeds limit of %s' % max_ambig,
        lambda id_, seq, qual: count_ambig(seq) > max_ambig))

    if qual_mappings:
        filters.append(QualMissing)
        filters.append(SeqQualBad(
            'Mean qual score below minimum of %s' % min_qual_score,
            lambda id_, seq, qual: mean(qual) < min_qual_score))
    if qual_score_window:
        filters.append(SeqQualBad(
            'Mean window qual score below minimum of %s' % min_qual_score,
            lambda id_, seq, qual:
                not check_window_qual_scores(qual, qual_score_window,
                                             min_qual_score)))

    # Changed this to check entire sequence after barcode-could cause issue
    # if barcode-linker-primer have long homopolymers though.
    filters.append(SeqQualBad(
        'Max homopolymer run exceeds limit of %s' % max_homopolymer,
        lambda id_, seq, qual: seq_exceeds_homopolymers(
            seq[barcode_len:], max_homopolymer)))

    # ---- Check seqs and write out ----
    fasta_out = open(dir_prefix + '/' + 'seqs.fna', 'w+')
    # check_seqs receives False (not None) when qual recording is off.
    qual_out = False
    try:
        if record_qual_scores:
            qual_out = open(dir_prefix + '/' + 'seqs_filtered.qual', 'w+')

        log_stats, pre_lens, post_lens = check_seqs(fasta_out, fasta_files,
            starting_ix, valid_map, qual_mappings, filters, barcode_len,
            keep_primer, keep_barcode, barcode_type, max_bc_errors,
            remove_unassigned, attempt_bc_correction,
            primer_seqs_lens, all_primers, max_primer_mm, disable_primers,
            reverse_primers, rev_primers, qual_out)
    finally:
        # close() is idempotent, so this is safe even if check_seqs already
        # closed the handles itself.
        fasta_out.close()
        if qual_out:
            qual_out.close()

    # Write log file
    log_file = open(dir_prefix + '/' + "split_library_log.txt", 'w+')
    try:
        log_file.write('\n'.join(log_stats))
    finally:
        log_file.close()

    # Write sequence distros here
    histogram_file = open(dir_prefix + '/' + 'histograms.txt', 'w+')
    try:
        histogram_file.write(format_histograms(
            *make_histograms(pre_lens, post_lens)))
    finally:
        histogram_file.close()