def fix_fasta_file(file, out_dir=None):
    '''
        Passes a file through biopython SeqIO to remove common
        formatting issues like '\\r' characters and unwrapped sequences.
        Every run of white-space in a record description is replaced with
        an underscore and the result becomes the whole header.
        The new file is saved with the suffix '_clean.fa'.

        file    -- path of the FASTA file to clean
        out_dir -- optional output directory, created if it is missing;
                   when None the directory of the input file is used

        Returns the path of the cleaned file. Raises AssertionError when
        check_header_pattern() rejects the input file.
    '''

    # Give up early if the file does not look like fasta. Raised explicitly
    # rather than with an assert statement so the check survives python -O.
    if not check_header_pattern(file):
        raise AssertionError(
            "Sorry, " + str(file) + " does not look like FASTA to me")

    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # Create the directory without a shell so that paths holding spaces
        # or shell metacharacters cannot be misinterpreted.
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        out_path = out_dir

    fixed_file = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    out_handle = general.open_write_file(fixed_file)
    try:
        # Iterate through the records to remove white-space
        # from the ID line
        new_records = []
        for record in SeqIO.parse(file, 'fasta'):
            header = whitespace.sub('_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            new_records.append(record)

        written = SeqIO.write(new_records, out_handle, 'fasta')
    finally:
        # Close (and therefore flush) the output before the path is handed
        # back, otherwise the caller may read a truncated file.
        out_handle.close()

    # Parenthesised single argument prints identically on Python 2 and 3
    print(str(written) + ' sequence records stored in ' + fixed_file)

    return(fixed_file)
示例#2
0
def fix_headers(fasta_file_name,
                qc_set_func,
                checked_qc_set_func,
                out_dir=None):
    '''
        Remove white spaces from the headers of a FASTA file. Every run of
        white space in a header line (a line starting with '>') becomes a
        single underscore; every line is written back with trailing white
        space stripped and a '\\n' ending. Fixed FASTA file is saved with
        the suffix '_h.fasta'.

        fasta_file_name     -- path of the FASTA file to repair
        qc_set_func         -- set of QC repairs that are still pending
        checked_qc_set_func -- set of QC checks that are still pending
        out_dir             -- optional output directory (no trailing
                               slash); defaults to the input's directory

        Returns a tuple of the fixed file path and both sets with
        'header_whitespace' removed.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    whitespace = re.compile(r'\s+')  # compiled once, reused for each header
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Always release the handles, even if the copy fails half way
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove qc step because it will be corrected in the final FASTA file
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(
        remove_set)  # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)
def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Remove white spaces from the headers of a FASTA file. Each run of
        white space in a line starting with '>' is replaced by one
        underscore. All lines lose their trailing white space and are
        ended with '\\n'. Fixed FASTA file is saved with the suffix
        '_h.fasta'.

        fasta_file_name     -- path of the FASTA file to repair
        qc_set_func         -- set of QC repairs that are still pending
        checked_qc_set_func -- set of QC checks that are still pending
        out_dir             -- optional output directory; defaults to the
                               directory of the input file

        Returns (fixed file path, qc_set_func, checked_qc_set_func) where
        both returned sets no longer contain 'header_whitespace'.
    '''
    (out_path,out_basename,out_ext)=general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_header = out_path + '/' +  out_basename + '_h.fasta'
    whitespace = re.compile(r'\s+') # raw string avoids the invalid '\s' escape
    broken_fasta=general.open_file(fasta_file_name)
    try:
        fixed_fasta=general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line=line.rstrip()
                if line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close() # closed even when the copy loop raises
    finally:
        broken_fasta.close()
    remove_set = set(['header_whitespace']) # Remove qc step because it will be corrected
    # in the final FASTA file
    qc_set_func = qc_set_func.difference(remove_set) # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(remove_set) # skip finished checks
    return(file_with_header, qc_set_func, checked_qc_set_func)
def fix_new_line(file, header_whitespace=False, out_dir=None):
    """
        Strips any new line character ('\\n' or '\\r') from each line in
        file and ends each line (including the last line) with a new
        line character ('\\n').

        file              -- path of the FASTA file to repair
        header_whitespace -- when true, runs of white space in header lines
                             are also replaced with '_' and the output
                             suffix becomes '_ended_h.fasta' instead of
                             '_ended.fasta'
        out_dir           -- optional output directory; defaults to the
                             directory of the input file

        Returns the path of the repaired file.
    """
    suffix = "_ended.fasta"
    if header_whitespace:
        suffix = "_ended_h.fasta"  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + "/" + out_basename + suffix
    whitespace = re.compile(r"\s+")
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                # Headers are only rewritten when the caller asked for it;
                # previously they were altered even for '_ended.fasta'.
                if header_whitespace and line.startswith(">"):
                    line = whitespace.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_new_line
def trim_template(forward,reverse,adapter_fasta,out_dir):
    '''
        Build the Trimmomatic command line for one pair of read files.

        The paired and unpaired outputs for each input are placed in
        out_dir and named after the input basename with the suffixes
        '_c_pair.fastq' and '_c_single.fastq'. The command is returned as
        a newline-terminated string; nothing is executed here.
    '''
    # ADAPTERS = TruSeq3-PE.fa for first dataset, or TruSeq-3-PE-2.fa for
    # second dataset, or TruSeq-3-SE.fa to force 'simple mode' only
    # (Supplementary table 2)
    # SW = sliding window quality cutoff, values from 2-35 were tested
    # S = stringency for maximum information mode, values from 0.1-0.9
    # (with 0.1 increments), 0.91-0.99 (with 0.01 increments) and 0.991 to
    # 0.999 (with 0.001 increments) were tested
    forward_stem = general.parse_filename(forward)[1]
    reverse_stem = general.parse_filename(reverse)[1]

    # Trimmomatic PE expects: in1 in2 out1_paired out1_unpaired
    # out2_paired out2_unpaired
    read_files = [forward, reverse]
    for stem in (forward_stem, reverse_stem):
        prefix = out_dir + '/' + stem
        read_files.append(prefix + '_c_pair.fastq')
        read_files.append(prefix + '_c_single.fastq')

    command_parts = [
        'java -jar ', path_to_trimmomatic, ' PE -threads 16 -phred33 ',
        ' '.join(read_files),
        ' ILLUMINACLIP:', adapter_fasta,
        ':2:30:12:1:true LEADING:3 MAXINFO:40:0.8 MINLEN:90\n',
    ]
    return ''.join(command_parts)
示例#6
0
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when 'header_whitespace' is in qc_set_func, in
        which case runs of white space in headers are replaced with '_'.

        fasta_file_name     -- path of the FASTA file to repair
        qc_set_func         -- set of QC repairs that are still pending
        checked_qc_set_func -- set of QC checks that are still pending
        out_dir             -- optional output directory; defaults to the
                               directory of the input file

        Returns (wrapped file path, qc_set_func without 'wrap', 'new_line'
        and 'header_whitespace', checked_qc_set_func without 'wrap').
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap.fasta'
    if fix_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    width = 60  # bases per wrapped sequence line
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence read but not yet written (always < width)
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    if dna:
                        # print remaining sequence before the next header
                        fixed_fasta.write(dna + '\n')
                        dna = ''
                    if fix_whitespace:
                        line = whitespace.sub('_', line)
                    fixed_fasta.write(line + '\n')
                else:
                    dna += line
                    # Emit every complete line in one pass and keep only the
                    # tail. Re-slicing the buffer once per output line is
                    # quadratic for a sequence stored on a single line.
                    full = len(dna) - len(dna) % width
                    for start in range(0, full, width):
                        fixed_fasta.write(dna[start:start + width] + '\n')
                    dna = dna[full:]
            if dna:  # catch the tail of the last record
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    # Remove all three qc steps because all will be corrected in the
    # final FASTA file
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta'.
        When 'header_whitespace' is in qc_set_func the suffix is
        '_wrap_h.fasta' and runs of white space in headers become '_'.

        fasta_file_name     -- path of the FASTA file to repair
        qc_set_func         -- set of QC repairs that are still pending
        checked_qc_set_func -- set of QC checks that are still pending
        out_dir             -- optional output directory; defaults to the
                               directory of the input file

        Returns (wrapped file path, reduced qc_set_func, reduced
        checked_qc_set_func).
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap.fasta'
    if fix_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path,out_basename,out_ext)=general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    width = 60 # bases per wrapped sequence line
    fixed_fasta=general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = '' # pending sequence, shorter than one output line
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'): # Print headers immediately to new file
                    if dna:
                        fixed_fasta.write(dna + '\n') # print remaining sequence
                        # before header
                        dna = '' # Reset DNA
                    if fix_whitespace:
                        line = whitespace.sub('_', line)
                    fixed_fasta.write(line + '\n')
                else:
                    dna += line
                    # Write all complete lines in a single pass; repeatedly
                    # chopping 60 bases off the front copies the rest of the
                    # buffer each time, which is quadratic for long lines.
                    full = len(dna) - len(dna) % width
                    for start in range(0, full, width):
                        fixed_fasta.write(dna[start:start + width] + '\n')
                    dna = dna[full:]
            # Catch the last record
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    remove_set = set(['wrap','new_line','header_whitespace']) # Remove all three qc steps
    # because all will be corrected in the final FASTA file
    qc_set_func = qc_set_func.difference(remove_set) # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(checked_remove_set) # skip finished checks
    return(file_with_wrapping, qc_set_func, checked_qc_set_func)
def fix_wrap(file, header_whitespace=False, out_dir=None):
    '''
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when header_whitespace is true.

        file              -- path of the FASTA file to repair
        header_whitespace -- when true, runs of white space in header lines
                             are replaced with '_'
        out_dir           -- optional output directory; defaults to the
                             directory of the input file

        Records are streamed to the output one line at a time. A header
        without any sequence is kept as a header-only record.

        Returns the path of the wrapped file.
    '''
    suffix = '_wrap.fasta'
    if header_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path,out_basename,out_ext)=general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    width = 60 # bases per wrapped sequence line
    fixed_fasta=general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(file)
        try:
            dna = '' # pending sequence, shorter than one output line
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    if dna:
                        fixed_fasta.write(dna + '\n')
                        dna = ''
                    if header_whitespace:
                        line = whitespace.sub('_', line)
                    fixed_fasta.write(line + '\n')
                else:
                    dna += line
                    # Fixed-width slices are used instead of textwrap.fill:
                    # textwrap treats '-' as a break opportunity, so gapped
                    # sequences came out with uneven line lengths.
                    full = len(dna) - len(dna) % width
                    for start in range(0, full, width):
                        fixed_fasta.write(dna[start:start + width] + '\n')
                    dna = dna[full:]

            # Catch the last record
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        fixed_fasta.close()

    return(file_with_wrapping)
def fix_wrap(file, header_whitespace=False, out_dir=None):
    """
        Wraps text in a FASTA file so that no line of sequence has more
        than 60 bases. Wrapped file is saved with the suffix '_wrap.fasta',
        or '_wrap_h.fasta' when header_whitespace is true.

        file              -- path of the FASTA file to repair
        header_whitespace -- when true, runs of white space in header lines
                             are replaced with '_'
        out_dir           -- optional output directory; defaults to the
                             directory of the input file

        Returns the path of the wrapped file.
    """
    suffix = "_wrap.fasta"
    if header_whitespace:
        suffix = "_wrap_h.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + "/" + out_basename + suffix
    whitespace = re.compile(r"\s+")
    width = 60  # bases per wrapped sequence line
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(file)
        try:
            dna = ""  # pending sequence, shorter than one output line
            # Every line is classified by the loop itself. Pulling the line
            # after a header with next() crashed when a header was the last
            # line and mistook a directly following header for sequence.
            for line in infile:
                line = line.rstrip()
                if line.startswith(">"):
                    if dna:
                        fixed_fasta.write(dna + "\n")
                        dna = ""
                    if header_whitespace:
                        # Gets rid of whitespace in the headers
                        line = whitespace.sub("_", line)
                    fixed_fasta.write(line + "\n")
                else:
                    dna += line
                    # Fixed-width slices instead of textwrap.fill, which
                    # breaks after '-' and so wraps gapped sequences unevenly
                    full = len(dna) - len(dna) % width
                    for start in range(0, full, width):
                        fixed_fasta.write(dna[start:start + width] + "\n")
                    dna = dna[full:]
            if dna:  # tail of the last record; nothing is written for an
                # empty input file
                fixed_fasta.write(dna + "\n")
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    return file_with_wrapping
示例#10
0
def fix_new_line(fasta_file_name,
                 qc_set_func,
                 checked_qc_set_func,
                 out_dir=None):
    '''
        Strips any new line character ('\\n' or '\\r') from each line in
        file and ends each line (including the last line) with a new
        line character ('\\n'). When 'header_whitespace' is in qc_set_func
        runs of white space in header lines are also replaced with '_'
        and the suffix is '_ended_h.fasta' instead of '_ended.fasta'.

        fasta_file_name     -- path of the FASTA file to repair
        qc_set_func         -- set of QC repairs that are still pending
        checked_qc_set_func -- set of QC checks that are still pending
        out_dir             -- optional output directory; defaults to the
                               directory of the input file

        Returns (repaired file path, qc_set_func without 'new_line' and
        'header_whitespace', checked_qc_set_func without 'new_line').
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if fix_whitespace:
        suffix = '_ended_h.fasta'  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2 needs universal-newline mode to split lines on '\r'
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if fix_whitespace and line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Always release the handles, even if the copy fails half way
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they will be corrected in the final
    # FASTA file
    remove_set = set(['new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)
示例#11
0
def fix_fasta_file(file, out_dir=None):
    '''
        Passes a file through biopython SeqIO to remove common
        formatting issues like '\\r' characters and unwrapped sequences.
        Runs of white-space in each record description are replaced with
        underscores and the result is used as the complete header.
        The new file is saved with the suffix '_clean.fa'.

        file    -- path of the FASTA file to clean
        out_dir -- optional output directory, created if it is missing;
                   when None the directory of the input file is used

        Returns the path of the cleaned file. Raises AssertionError when
        check_header_pattern() rejects the input file.
    '''

    # Give up early if the file does not look like fasta. An explicit raise
    # keeps the check active under python -O, unlike an assert statement.
    if not check_header_pattern(file):
        raise AssertionError(
            "Sorry, " + str(file) + " does not look like FASTA to me")

    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # No shell is involved, so unusual characters in the path are safe
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        out_path = out_dir

    fixed_file = out_path + '/' + out_basename + suffix
    whitespace = re.compile(r'\s+')
    out_handle = general.open_write_file(fixed_file)
    try:
        fasta_in = SeqIO.parse(file, 'fasta')

        # Iterate through the records to remove white-space
        # from the ID line
        new_records = []
        for record in fasta_in:
            header = whitespace.sub('_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            new_records.append(record)

        written = SeqIO.write(new_records, out_handle, 'fasta')
    finally:
        # Flush and close before returning the path so the caller never
        # reads a partially written file.
        out_handle.close()

    # Parenthesised single argument prints identically on Python 2 and 3
    print(str(written) + ' sequence records stored in ' + fixed_file)

    return (fixed_file)
示例#12
0
def fix_headers(file, out_dir=None):
    """
        Remove white spaces that break Trimmomatic and some other bioinfo
        tools from the headers of a FASTA file. Every run of white space in
        a line starting with '>' is replaced by one underscore; all lines
        are written with trailing white space stripped and a '\\n' ending.
        Fixed FASTA file is saved with the suffix '_h.fasta'.

        file    -- path of the FASTA file to repair
        out_dir -- optional output directory; defaults to the directory of
                   the input file

        Returns the path of the fixed file.
    """
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + "/" + out_basename + "_h.fasta"
    whitespace = re.compile(r"\s+")  # compiled once, reused for each header
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith(">"):
                    line = whitespace.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            # Always release the handles, even if the copy fails half way
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_header
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
        Strips any new line character ('\\n' or '\\r') from each line in
        file and ends each line (including the last line) with a new
        line character ('\\n'). If 'header_whitespace' is in qc_set_func,
        runs of white space in header lines are replaced with '_' as well
        and the suffix is '_ended_h.fasta' rather than '_ended.fasta'.

        fasta_file_name     -- path of the FASTA file to repair
        qc_set_func         -- set of QC repairs that are still pending
        checked_qc_set_func -- set of QC checks that are still pending
        out_dir             -- optional output directory; defaults to the
                               directory of the input file

        Returns (repaired file path, reduced qc_set_func, reduced
        checked_qc_set_func).
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if fix_whitespace:
        suffix = '_ended_h.fasta' # make suffix match QC steps taken
    (out_path,out_basename,out_ext)=general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_new_line = out_path + '/' +  out_basename + suffix
    whitespace = re.compile(r'\s+') # raw string avoids the invalid '\s' escape
    if sys.version_info > (3, 0):
        broken_fasta=general.open_file(fasta_file_name)
    else:
        # Python 2 needs universal-newline mode to split lines on '\r'
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta=general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line=line.rstrip()
                if fix_whitespace and line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close() # closed even when the copy loop raises
    finally:
        broken_fasta.close()
    remove_set = set(['new_line','header_whitespace']) # Remove both qc steps
    # because they will be corrected in the final FASTA file
    qc_set_func = qc_set_func.difference(remove_set) # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(checked_remove_set) # skip finished checks
    return(file_with_new_line, qc_set_func, checked_qc_set_func)
def fix_headers(file, out_dir=None):
    '''
        Remove white spaces that break Trimmomatic and some other bioinfo
        tools from the headers of a FASTA file. Each run of white space in
        a line starting with '>' becomes one underscore; every line is
        written with trailing white space stripped and a '\\n' ending.
        Fixed FASTA file is saved with the suffix '_h.fasta'.

        file    -- path of the FASTA file to repair
        out_dir -- optional output directory; defaults to the directory of
                   the input file

        Returns the path of the fixed file.
    '''
    (out_path,out_basename,out_ext)=general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir # switch to user specified output directory
    file_with_header = out_path + '/' +  out_basename + '_h.fasta'
    whitespace = re.compile(r'\s+') # raw string avoids the invalid '\s' escape
    broken_fasta=general.open_file(file)
    try:
        fixed_fasta=general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line=line.rstrip()
                if line.startswith('>'):
                    line = whitespace.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close() # closed even when the copy loop raises
    finally:
        broken_fasta.close()
    return(file_with_header)
示例#15
0
def main():
    '''
        For a given FASTA file function runs all qc steps listed in the
        list of steps.
        
        USAGE: python fasta_o_matic.py [-h] [-v] [-q] [-c] -f FILE -s STEPS
        
        QC STEPS:
        
        unique - Checks if FASTA headers have unique first words or can be made unique automatically. May save altered file with suffix '_h.fasta'.
        
        new_line - Checks if the last line in a FASTA file ends in the
        standard new line character ('\\n') and will also fail if the sequence 
        lines end in the less common '\\r' character. Reformatted files are 
        saved with the '_ended.fasta' suffix.
        
        wrap - Checks if the sequence lines in a FASTA file exceed 80 characters and if all the wrapped lines are the same length (this should be true if the FASTA file is  wrapped). Wrapped file is saved with the suffix
        '_wrap.fasta'.
        
        header_whitespace - Remove white spaces from the headers of a FASTA file.
        Fixed FASTA file is saved with the suffix '_h.fasta'.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # NOTE: the description below is one string literal built with
    # backslash line continuations, so the leading spaces of every
    # continued line are part of the string.
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Script runs quality checking and filtering \
                                     based on a user-defined list of quality \
                                     checks. Command-line options that may be \
                                     omitted (i.e. are NOT required) are shown \
                                     in square brackets.\
                                     \
                                     QC STEPS:\
                                     \
                                     unique -- checks if FASTA headers have unique\
                                     first words or can be made unique \
                                     automatically. May save altered file with \
                                     suffix \'_h.fasta\'.\
                                     \
                                     new_line -- checks if the last line in a FASTA file ends in the standard new line \
                                     character (\'\\n\') and will also fail if \
                                     the sequence lines end in the less common \
                                     \'\\r\' character. Reformatted files are \
                                     saved with the \'_ended.fasta\' suffix.\
                                     \
                                     wrap -- checks if the sequence lines in a \
                                     FASTA file exceed 80 characters and if all \
                                     the wrapped lines are the same length \
                                     (this should be true if the FASTA file is \
                                     wrapped). Wrapped file is saved with the \
                                     suffix \'_wrap.fasta\'.\
                                     \
                                     header_whitespace -- remove white spaces \
                                     from the headers of a FASTA file. Fixed \
                                     FASTA file is saved with the suffix \
                                     \'_h.fasta\'.')
    # -v and -q both write to dest='verbose'. Because -v defaults to True,
    # verbose output is on unless -q is given.
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c',
                        '--colorized',
                        help='Colorizes log reports. Use only if printing \
                     output to screen.',
                        action='store_true',
                        dest='colorized')
    # The input FASTA file is the only required option
    parser.add_argument(
        '-f',
        '--fasta',
        dest='fasta_file_name',
        help='This is the the full path (path and filename) of \
                     the user provided FASTA file.',
        required=True)
    # One or more step names; defaults to wrap, new_line and unique
    parser.add_argument('-s',
                        '--qc_steps',
                        nargs='+',
                        dest='steps',
                        help='List of QC steps to  perform on FASTA file \
                     options are wrap, new_line, header_whitespace, unique \
                     (default= -s wrap new_line unique).',
                        default=['wrap', 'new_line', 'unique'],
                        required=False)
    parser.add_argument(
        '-o',
        '--out_dir',
        dest='out_dir',
        help=
        'Output directory for any repaired FASTA created (no trailing slash).',
        default=None,
        required=False)
    args = parser.parse_args()
    # Configure logging: DEBUG level when verbose, otherwise only the
    # message format is set and the level is left at its default.
    if args.verbose:
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        doc()
        log.info(
            'Output is verbose. Run with -q, --quiet flag to suppress full output.'
        )
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    if args.colorized:
        # Imported only on request and never referenced afterwards;
        # presumably it changes log output as an import side effect -
        # TODO confirm
        import Colorer
    log.info('#######################################')
    log.info('# Unit testing...')
    log.info('#######################################')
    # test_reformatting() runs on every invocation. Log messages up to
    # CRITICAL are disabled while it runs and re-enabled afterwards
    # (assuming `log` is the standard logging module - TODO confirm).
    log.disable(log.CRITICAL)
    (out_path, out_basename,
     out_ext) = general.parse_filename(args.fasta_file_name)
    if args.out_dir is not None:
        out_path = args.out_dir  # switch to user specified output directory
    test_reformatting(out_path)
    log.disable(log.NOTSET)
    log.info('#######################################')
    log.info('# Done unit testing.')
    log.info('#######################################')
    # Run reformatting
    # args.out_dir is passed as given (possibly None), not the out_path
    # computed above.
    final_fasta_file_name = run_steps(args.fasta_file_name, args.steps,
                                      args.out_dir)
    return (final_fasta_file_name)
def main():
    '''
        For a given FASTA file function runs all qc steps listed in the
        list of steps.
        
        USAGE: python fasta_o_matic.py [-h] [-v] [-q] [-c] -f FILE -s STEPS
        
        QC STEPS:
        
        unique - Checks if FASTA headers have unique first words or can be made unique automatically. May save altered file with suffix '_h.fasta'.
        
        new_line - Checks if the last line in a FASTA file ends in the
        standard new line character ('\\n') and will also fail if the sequence 
        lines end in the less common '\\r' character. Reformatted files are 
        saved with the '_ended.fasta' suffix.
        
        wrap - Checks if the sequence lines in a FASTA file exceed 80 characters and if all the wrapped lines are the same length (this should be true if the FASTA file is  wrapped). Wrapped file is saved with the suffix
        '_wrap.fasta'.
        
        header_whitespace - Remove white spaces from the headers of a FASTA file.
        Fixed FASTA file is saved with the suffix '_h.fasta'.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # NOTE: the description below is one string literal built with
    # backslash line continuations, so the leading spaces of every
    # continued line are part of the string.
    parser = argparse.ArgumentParser(
                                     description='DESCRIPTION: Script runs quality checking and filtering \
                                     based on a user-defined list of quality \
                                     checks. Command-line options that may be \
                                     omitted (i.e. are NOT required) are shown \
                                     in square brackets.\
                                     \
                                     QC STEPS:\
                                     \
                                     unique -- checks if FASTA headers have unique\
                                     first words or can be made unique \
                                     automatically. May save altered file with \
                                     suffix \'_h.fasta\'.\
                                     \
                                     new_line -- checks if the last line in a FASTA file ends in the standard new line \
                                     character (\'\\n\') and will also fail if \
                                     the sequence lines end in the less common \
                                     \'\\r\' character. Reformatted files are \
                                     saved with the \'_ended.fasta\' suffix.\
                                     \
                                     wrap -- checks if the sequence lines in a \
                                     FASTA file exceed 80 characters and if all \
                                     the wrapped lines are the same length \
                                     (this should be true if the FASTA file is \
                                     wrapped). Wrapped file is saved with the \
                                     suffix \'_wrap.fasta\'.\
                                     \
                                     header_whitespace -- remove white spaces \
                                     from the headers of a FASTA file. Fixed \
                                     FASTA file is saved with the suffix \
                                     \'_h.fasta\'.')
    # -v and -q both write to dest='verbose'. Because -v defaults to True,
    # verbose output is on unless -q is given.
    parser.add_argument('-v', '--verbose', action='store_true',
                     dest='verbose', help='Runs reporting status updates',
                     default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                     dest='verbose', help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                     help='Colorizes log reports. Use only if printing \
                     output to screen.',action='store_true',dest='colorized')
    # The input FASTA file is the only required option
    parser.add_argument('-f', '--fasta', dest='fasta_file_name',
                     help='This is the the full path (path and filename) of \
                     the user provided FASTA file.', required=True)
    # One or more step names; defaults to wrap, new_line and unique
    parser.add_argument('-s', '--qc_steps', nargs='+', dest='steps',
                     help='List of QC steps to  perform on FASTA file \
                     options are wrap, new_line, header_whitespace, unique \
                     (default= -s wrap new_line unique).',
                     default=['wrap','new_line','unique'],
                     required=False)
    parser.add_argument('-o', '--out_dir', dest='out_dir',
                        help='Output directory for any repaired FASTA created (no trailing slash).', default=None,required=False)
    args = parser.parse_args()
    # Configure logging: DEBUG level when verbose, otherwise only the
    # message format is set and the level is left at its default.
    if args.verbose:
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        doc()
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    if args.colorized:
        # Imported only on request and never referenced afterwards;
        # presumably it changes log output as an import side effect -
        # TODO confirm
        import Colorer
    log.info('#######################################')
    log.info('# Unit testing...')
    log.info('#######################################')
    # test_reformatting() runs on every invocation. Log messages up to
    # CRITICAL are disabled while it runs and re-enabled afterwards
    # (assuming `log` is the standard logging module - TODO confirm).
    log.disable(log.CRITICAL)
    (out_path,out_basename,out_ext)=general.parse_filename(args.fasta_file_name)
    if args.out_dir is not None:
        out_path = args.out_dir # switch to user specified output directory
    test_reformatting(out_path)
    log.disable(log.NOTSET)
    log.info('#######################################')
    log.info('# Done unit testing.')
    log.info('#######################################')
    # Run reformatting
    # args.out_dir is passed as given (possibly None), not the out_path
    # computed above.
    final_fasta_file_name = run_steps(args.fasta_file_name, args.steps, args.out_dir)
    return(final_fasta_file_name)
def main():
    '''
        Write Trimmomatic read cleaning scripts for a list of FASTQ files.

        Steps, in order:
          1. Parse the command-line arguments and configure logging.
          2. Read the forward (and, unless -s is given, reverse) read files
             from the read list with trimmomatic_template.parse_file.
          3. Open and close every listed read file once, then replace each
             entry with the value of general.convert_to_full.
          4. Create <out>/<project>, <out>/<project>/scripts and
             <out>/<project>/qsubs.
          5. Pass the adapter FASTA through fasta_o_matic.run_steps.
          6. Write one 'run_trimmomatic_<basename>.sh' script per forward
             read file and one 'qsub_trimmomatic.sh' script that holds a
             qsub line for each of them.

        Both script handles are closed even when a helper raises part way
        through, so partially written scripts are flushed rather than
        leaked.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Write Trimmomatic read cleaning scripts '
                    'and a qsub submission script for a list of FASTQ read '
                    'files. Command-line options that may be omitted '
                    '(i.e. are NOT required) are shown in square brackets.')
    # -v and -q share dest='verbose'; verbose stays True unless -q is given.
    parser.add_argument('-v', '--verbose', action='store_true',
                        dest='verbose', help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                        dest='verbose', help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing \
                     output to screen.', action='store_true', dest='colorized')
    parser.add_argument('-r', '--read_list', dest='read_list',
                        help='This is the the full path (path and filename) of \
                        the user provided list of read files. The file should \
                        be tab separated with the first read file, then the \
                        second read file (see example_read_list_PE.tab). If a \
                        sample has multiple fastq files for R1 and R2 separate \
                        these with commas (see example_read_list_PE_multi.tab).\
                        For single end reads each line should be a path \
                        to a fastq file. For single end reads each line should \
                        be a path to a fastq file (see example_read_list_SE.tab\
                        )', required=True)
    parser.add_argument('-p', '--project', dest='project',
                        help='The project id. This will be used to name output \
                        (default=project).', default='project', required=False)
    parser.add_argument('-a', '--adapter', dest='adapter',
                        help='The adapter fasta file. This will be used to \
                        clean reads',
                        default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
                        required=False)
    parser.add_argument('-s', '--single_end', action='store_true', dest='single',
                        help='If your reads are single end use this flag. \
                        Without it the script assumes reads are paired end. \
                        Also skip the second column (the reverse fastq files) \
                        when making your read list', required=False,
                        default=False)
    parser.add_argument('-x', '--convert_header', action='store_true',
                        dest='convert_header', help='If the illumina headers \
                        do not end in /1 or /2 use this parameter to indicat \
                        that headers need to be converted. Check your headers \
                        by typing "head FASTA_FULL_PATH" and read more about \
                        illumina headers at \
                        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False, required=False)
    # NOTE: min_read_length is only echoed by the print() below; it is not
    # passed to either trimmomatic template in this function.
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90)
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)', required=False,
                        default='~')
    # NOTE: args.sequence is never read in this function; the option is kept
    # so existing command lines that pass -d still parse.
    parser.add_argument('-d', '--dna', dest='sequence', help='DNA sequence to \
                        summarize', default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
                        required=False)
    args = parser.parse_args()
    if args.colorized:
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards, reverses) = trimmomatic_template.parse_file(args.read_list,
                                                           args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    # Each file is opened and closed straight away; the handle itself is
    # never used, so this only exercises general.open_file on every path
    # before any output is written.
    for index, forward in enumerate(forwards):
        general.open_file(forward).close()
        forwards[index] = general.convert_to_full(forward)
        if not args.single:
            # reverses is only indexed for paired-end runs.
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, _) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir) # Sanity check directory
    out_dir = out_dir + '/' + args.project # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell fragment appended after 'cat <fastq>': the awk program rebuilds
    # line 1 of every 4-line record from the colon separated fields of $1
    # plus the first character of $2, prints a bare '+' for line 3 and
    # passes the other lines through; '>' redirects into the file name that
    # is appended after this string.
    convert=' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        # From here on args.adapter is whatever run_steps returned.
        args.adapter = fasta_o_matic.run_steps(args.adapter,['wrap', 'new_line','header_whitespace'])
        for index, forward in enumerate(forwards):
            (_, f_basename, _) = general.parse_filename(forward)
            script_path = (out_dir + '/scripts/run_trimmomatic_'
                           + f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 '
                              + script_path + '\n')
            if not args.single:
                (_, r_basename, _) = general.parse_filename(reverses[index])
            trim_script = general.open_write_file(script_path)
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
                    trim_script.write('cat ' + forwards[index] + convert
                                      + new_forward_fastq + '\n')
                    # The trimming command below must read the converted file.
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = out_dir + '/' + r_basename + '_h.fastq'
                        trim_script.write('cat ' + reverses[index] + convert
                                          + new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(trimmomatic_template.trim_template(
                        forwards[index],
                        reverses[index],
                        args.adapter,
                        out_dir))
                else:
                    trim_script.write(trimmomatic_template.trim_template_single(forwards[index]))
                    # Section in progress... (Remember to point to a SE adapter fasta file
                    # by default)
            finally:
                # Close the per-sample script even if a template call raised.
                trim_script.close()
    finally:
        qsub_script.close()
# Example #18
# 0
def main():
    '''
        Write Trimmomatic read cleaning scripts for a list of FASTQ files.

        After parsing the command line and configuring logging this
        function:
          * reads the forward (and, unless -s is given, reverse) read files
            from the read list via trimmomatic_template.parse_file;
          * opens and closes each listed read file once and replaces each
            entry with the value of general.convert_to_full;
          * creates <out>/<project> together with its 'scripts' and 'qsubs'
            sub-directories;
          * passes the adapter FASTA through fasta_o_matic.run_steps;
          * writes one 'run_trimmomatic_<basename>.sh' per forward read
            file and a 'qsub_trimmomatic.sh' holding one qsub line per
            generated script.

        The script handles are closed in 'finally' clauses, so they are
        released even when one of the helpers raises.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Write Trimmomatic read cleaning scripts '
        'and a qsub submission script for a list of FASTQ read files. '
        'Command-line options that may be omitted (i.e. are NOT required) '
        'are shown in square brackets.')
    # -v and -q write to the same dest; verbose is True unless -q is given.
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c',
                        '--colorized',
                        help='Colorizes log reports. Use only if printing \
                     output to screen.',
                        action='store_true',
                        dest='colorized')
    parser.add_argument(
        '-r',
        '--read_list',
        dest='read_list',
        help='This is the the full path (path and filename) of \
                        the user provided list of read files. The file should \
                        be tab separated with the first read file, then the \
                        second read file (see example_read_list_PE.tab). If a \
                        sample has multiple fastq files for R1 and R2 separate \
                        these with commas (see example_read_list_PE_multi.tab).\
                        For single end reads each line should be a path \
                        to a fastq file. For single end reads each line should \
                        be a path to a fastq file (see example_read_list_SE.tab\
                        )',
        required=True)
    parser.add_argument(
        '-p',
        '--project',
        dest='project',
        help='The project id. This will be used to name output \
                        (default=project).',
        default='project',
        required=False)
    parser.add_argument(
        '-a',
        '--adapter',
        dest='adapter',
        help='The adapter fasta file. This will be used to \
                        clean reads',
        default=
        '/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
        required=False)
    parser.add_argument('-s',
                        '--single_end',
                        action='store_true',
                        dest='single',
                        help='If your reads are single end use this flag. \
                        Without it the script assumes reads are paired end. \
                        Also skip the second column (the reverse fastq files) \
                        when making your read list',
                        required=False,
                        default=False)
    parser.add_argument('-x',
                        '--convert_header',
                        action='store_true',
                        dest='convert_header',
                        help='If the illumina headers \
                        do not end in /1 or /2 use this parameter to indicat \
                        that headers need to be converted. Check your headers \
                        by typing "head FASTA_FULL_PATH" and read more about \
                        illumina headers at \
                        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False,
                        required=False)
    # NOTE: min_read_length is only echoed by the print() below; neither
    # trimmomatic template receives it in this function.
    parser.add_argument('-m',
                        '--min_read_length',
                        dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False,
                        default=90)
    parser.add_argument('-o',
                        '--out',
                        dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False,
                        default='~')
    # NOTE: args.sequence is never read in this function; the option stays
    # so command lines that already pass -d keep parsing.
    parser.add_argument(
        '-d',
        '--dna',
        dest='sequence',
        help='DNA sequence to \
                        summarize',
        default=
        'TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
        required=False)
    args = parser.parse_args()
    if args.colorized:
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s:  %(message)s', level=log.DEBUG)
        log.info(
            'Output is verbose. Run with -q, --quiet flag to suppress full output.'
        )
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards,
     reverses) = trimmomatic_template.parse_file(args.read_list, args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    # The handles are discarded immediately: the point is only to run
    # general.open_file on every path before any output is written.
    for index, forward in enumerate(forwards):
        general.open_file(forward).close()
        forwards[index] = general.convert_to_full(forward)
        if not args.single:
            # reverses is only indexed for paired-end runs.
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, _) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir)  # Sanity check directory
    out_dir = out_dir + '/' + args.project  # final out directory is 'project_id'
    for sub_dir in ('', '/scripts', '/qsubs'):
        general.mk_out_sub_directory(out_dir + sub_dir)
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell fragment placed after 'cat <fastq>': the awk program rewrites
    # line 1 of each 4-line record from the colon separated fields of $1
    # plus the first character of $2, emits a bare '+' for line 3, copies
    # the remaining lines, and redirects into the path appended afterwards.
    convert = ' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir +
                                          '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        # args.adapter is replaced by the value run_steps returns.
        args.adapter = fasta_o_matic.run_steps(
            args.adapter, ['wrap', 'new_line', 'header_whitespace'])
        for index, forward in enumerate(forwards):
            (_, f_basename, _) = general.parse_filename(forward)
            script_path = (out_dir + '/scripts/run_trimmomatic_' +
                           f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' +
                              script_path + '\n')
            if not args.single:
                (_, r_basename, _) = general.parse_filename(reverses[index])
            trim_script = general.open_write_file(script_path)
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
                    trim_script.write('cat ' + forwards[index] + convert +
                                      new_forward_fastq + '\n')
                    # Trimming below must read the converted file instead.
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename +
                                             '_h.fastq')
                        trim_script.write('cat ' + reverses[index] + convert +
                                          new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(
                        trimmomatic_template.trim_template(
                            forwards[index], reverses[index], args.adapter,
                            out_dir))
                else:
                    trim_script.write(
                        trimmomatic_template.trim_template_single(
                            forwards[index]))
                    # Section in progress... (Remember to point to a SE adapter fasta file
                    # by default)
            finally:
                # Release the per-sample script even if a template call raised.
                trim_script.close()
    finally:
        qsub_script.close()