def fix_fasta_file(file, out_dir=None):
    '''
    Rewrite a FASTA file through Biopython SeqIO to normalise its format.

    The file is parsed with SeqIO.parse and written back with SeqIO.write.
    This is intended to remove common formatting issues such as '\\r'
    characters and unwrapped sequences. Whitespace in each record's
    description is replaced with '_' and the result becomes the whole header.

    Arguments:
    file -- path of the FASTA file to clean.
    out_dir -- optional output directory (created if missing). When None
               the directory reported by general.parse_filename is used.

    Returns the path of the new file, which ends in '_clean.fa'.

    Raises AssertionError if check_header_pattern rejects the file.
    '''
    # Give up early if the file does not look like FASTA. The error is
    # raised explicitly rather than with `assert` so the check survives
    # `python -O`; the exception type seen by callers is unchanged.
    if not check_header_pattern(file):
        raise AssertionError(
            "Sorry, " + str(file) + " does not look like FASTA to me")
    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # Create the directory without a shell so that spaces or shell
        # metacharacters in out_dir cannot break or alter the command.
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        out_path = out_dir
    fixed_file = out_path + '/' + out_basename + suffix

    def _underscore_headers(records):
        # Collapse each description into a whitespace-free identifier.
        for record in records:
            header = re.sub(r'\s+', '_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            yield record

    out_handle = general.open_write_file(fixed_file)
    try:
        # Records are streamed so the whole file is never held in memory.
        written = SeqIO.write(
            _underscore_headers(SeqIO.parse(file, 'fasta')),
            out_handle, 'fasta')
    finally:
        # Close (and flush) the output before the path is handed back.
        out_handle.close()
    print(str(written) + ' sequence records stored in ' + fixed_file)
    return fixed_file
def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func,
                out_dir=None):
    '''
    Copy a FASTA file, replacing each run of whitespace in a header by '_'.

    Every line is right-stripped and written back terminated by '\\n';
    only lines starting with '>' have their whitespace replaced. The copy
    is named '<basename>_h.fasta' and is placed in out_dir when given,
    otherwise in the directory reported by general.parse_filename.

    Returns a tuple (new_file_path, qc_set_func, checked_qc_set_func) in
    which 'header_whitespace' has been removed from both sets, because
    that step is already applied in the file written here.
    '''
    source_dir, stem, _ext = general.parse_filename(fasta_file_name)
    # Use the caller's directory when one was supplied.
    target_dir = source_dir if out_dir is None else out_dir
    file_with_header = target_dir + '/' + stem + '_h.fasta'

    reader = general.open_file(fasta_file_name)
    writer = general.open_write_file(file_with_header)
    whitespace_run = re.compile(r'\s+')
    for raw_line in reader:
        text = raw_line.rstrip()
        if text.startswith('>'):  # header line
            text = whitespace_run.sub('_', text)
        writer.write(text + '\n')
    writer.close()
    reader.close()

    finished = ('header_whitespace',)
    remaining_repairs = qc_set_func.difference(finished)
    remaining_checks = checked_qc_set_func.difference(finished)
    return (file_with_header, remaining_repairs, remaining_checks)
def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func,
                out_dir=None):
    '''
    Write a '<basename>_h.fasta' copy of a FASTA file whose headers
    contain no whitespace.

    Each input line is right-stripped and re-terminated with '\\n'. In
    lines beginning with '>' every run of whitespace becomes one '_'.
    The copy goes to out_dir when supplied, otherwise to the directory
    returned by general.parse_filename.

    Returns (new_file_path, qc_set_func, checked_qc_set_func); both sets
    are returned without 'header_whitespace' because the step is done.
    '''
    folder, base_name, _extension = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        folder = out_dir  # user specified output directory wins
    file_with_header = '/'.join([folder, base_name + '_h.fasta'])

    source = general.open_file(fasta_file_name)
    sink = general.open_write_file(file_with_header)
    for entry in source:
        entry = entry.rstrip()
        is_header = entry.startswith('>')
        sink.write((re.sub(r'\s+', '_', entry) if is_header else entry)
                   + '\n')
    sink.close()
    source.close()

    # The repair is part of the file just written, so drop it from both
    # the pending-repair set and the pending-check set.
    completed = set(['header_whitespace'])
    return (file_with_header,
            qc_set_func.difference(completed),
            checked_qc_set_func.difference(completed))
def fix_new_line(file, header_whitespace=False, out_dir=None):
    """
    Rewrite a FASTA file so every line ends with exactly one '\\n'.

    Trailing whitespace, including '\\n' and '\\r', is stripped from each
    line and a single '\\n' is appended, so the last line is terminated too.

    Arguments:
    file -- path of the FASTA file to repair.
    header_whitespace -- when True, runs of whitespace inside header
        lines are also replaced by '_' and the suffix becomes
        '_ended_h.fasta'. When False headers are left as they are and
        the suffix is '_ended.fasta'.
    out_dir -- optional output directory; defaults to the directory
        reported by general.parse_filename.

    Returns the path of the rewritten file.
    """
    # The suffix records which QC steps were applied to the file.
    suffix = "_ended_h.fasta" if header_whitespace else "_ended.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + "/" + out_basename + suffix

    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                # Only touch headers when the caller asked for it, so
                # the output always matches the suffix it is saved under.
                if header_whitespace and line.startswith(">"):
                    line = re.sub(r"\s+", "_", line)
                fixed_fasta.write(line + "\n")
        finally:
            fixed_fasta.close()
    finally:
        # Handles are released even if reading or writing fails.
        broken_fasta.close()
    return file_with_new_line
def trim_template(forward, reverse, adapter_fasta, out_dir):
    '''
    Build the Trimmomatic paired-end command line for one read pair.

    Four outputs are named in out_dir from the basenames of the inputs:
    '<basename>_c_pair.fastq' and '<basename>_c_single.fastq' for the
    forward file and then the reverse file.

    Returns the command as one string ending in a newline. Nothing is
    executed here.
    '''
    # Author's notes on the fixed Trimmomatic settings:
    #   ADAPTERS = TruSeq3-PE.fa for the first dataset, TruSeq-3-PE-2.fa
    #     for the second dataset, or TruSeq-3-SE.fa to force 'simple
    #     mode' only (Supplementary table 2).
    #   SW = sliding window quality cutoff; values from 2-35 were tested.
    #   S = stringency for maximum information mode; values 0.1-0.9
    #     (0.1 steps), 0.91-0.99 (0.01 steps) and 0.991-0.999 (0.001
    #     steps) were tested.
    _f_dir, forward_stem, _f_ext = general.parse_filename(forward)
    _r_dir, reverse_stem, _r_ext = general.parse_filename(reverse)

    # Order matters: forward paired, forward single, reverse paired,
    # reverse single.
    outputs = [out_dir + '/' + stem + tag
               for stem in (forward_stem, reverse_stem)
               for tag in ('_c_pair.fastq', '_c_single.fastq')]

    clip_and_filters = ('ILLUMINACLIP:' + adapter_fasta
                        + ':2:30:12:1:true LEADING:3 MAXINFO:40:0.8'
                        + ' MINLEN:90\n')
    pieces = ['java -jar', path_to_trimmomatic, 'PE -threads 16 -phred33',
              forward, reverse] + outputs + [clip_and_filters]
    return ' '.join(pieces)
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
    Wrap a FASTA file so that no sequence line has more than 60 bases.

    Header lines (starting with '>') are written as soon as they are
    read. Sequence is buffered only until 60 characters are available,
    so memory use stays small even for very long unwrapped records.
    When 'header_whitespace' is in qc_set_func, runs of whitespace in
    headers are replaced by '_' and the suffix becomes '_wrap_h.fasta';
    otherwise the suffix is '_wrap.fasta'.

    Arguments:
    fasta_file_name -- path of the FASTA file to wrap.
    qc_set_func -- set of QC repairs still pending.
    checked_qc_set_func -- set of QC checks still pending.
    out_dir -- optional output directory; defaults to the directory
        reported by general.parse_filename.

    Returns (wrapped_file_path, qc_set_func, checked_qc_set_func) with
    'wrap', 'new_line' and 'header_whitespace' removed from the repairs
    and 'wrap' removed from the checks.
    '''
    width = 60
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap_h.fasta' if fix_whitespace else '_wrap.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix

    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence not yet long enough to fill a line
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    if dna:
                        # Flush the short tail of the previous record.
                        fixed_fasta.write(dna + '\n')
                        dna = ''
                    if fix_whitespace:
                        line = re.sub(r'\s+', '_', line)
                    fixed_fasta.write(line + '\n')
                else:
                    dna += line
                    if len(dna) >= width:
                        # Emit every complete line in one pass. Slicing
                        # by offset avoids re-copying the remainder
                        # after each line, which is quadratic on a very
                        # long unwrapped sequence.
                        full = len(dna) - len(dna) % width
                        for start in range(0, full, width):
                            fixed_fasta.write(
                                dna[start:start + width] + '\n')
                        dna = dna[full:]
            if dna:
                fixed_fasta.write(dna + '\n')  # tail of the last record
        finally:
            infile.close()
    finally:
        # Handles are released even if reading or writing fails.
        fixed_fasta.close()

    # All three repairs are reflected in the file written above.
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)
    checked_qc_set_func = checked_qc_set_func.difference(set(['wrap']))
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
    Wrap a FASTA file so that no sequence line has more than 60 bases.

    Headers (lines starting with '>') are copied straight through.
    Sequence is accumulated only until a full 60-character line can be
    written, so long unwrapped records are streamed rather than held in
    memory. When 'header_whitespace' is in qc_set_func, whitespace runs
    in headers become '_' and the suffix is '_wrap_h.fasta'; otherwise
    the suffix is '_wrap.fasta'.

    Arguments:
    fasta_file_name -- path of the FASTA file to wrap.
    qc_set_func -- set of QC repairs still pending.
    checked_qc_set_func -- set of QC checks still pending.
    out_dir -- optional output directory; defaults to the directory
        reported by general.parse_filename.

    Returns (wrapped_file_path, qc_set_func, checked_qc_set_func) with
    'wrap', 'new_line' and 'header_whitespace' removed from the repairs
    and 'wrap' removed from the checks.
    '''
    width = 60
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap.fasta'
    if fix_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix

    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # pending sequence shorter than one output line
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    if dna:
                        # Write the remaining sequence of the previous
                        # record before starting a new one.
                        fixed_fasta.write(dna + '\n')
                        dna = ''
                    if fix_whitespace:
                        line = re.sub(r'\s+', '_', line)
                    fixed_fasta.write(line + '\n')
                    continue
                dna += line
                if len(dna) < width:
                    continue
                # Write all complete lines by offset. Trimming the
                # buffer once per input line, instead of once per output
                # line, keeps the work linear in the sequence length.
                full = len(dna) - len(dna) % width
                for start in range(0, full, width):
                    fixed_fasta.write(dna[start:start + width] + '\n')
                dna = dna[full:]
            if dna:
                fixed_fasta.write(dna + '\n')  # tail of the last record
        finally:
            infile.close()
    finally:
        # Close the output even when reading or writing raises.
        fixed_fasta.close()

    # All three repairs are reflected in the file written above.
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)
    checked_qc_set_func = checked_qc_set_func.difference(set(['wrap']))
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
def fix_wrap(file, header_whitespace=False, out_dir=None):
    '''
    Wrap a FASTA file so that no sequence line has more than 60 bases.

    The sequence lines of each record are joined and re-wrapped at 60
    characters. Records are written as soon as they are complete rather
    than collected in memory first.

    Arguments:
    file -- path of the FASTA file to wrap.
    header_whitespace -- when True, runs of whitespace in headers are
        replaced by '_' and the suffix is '_wrap_h.fasta'; otherwise
        the suffix is '_wrap.fasta'.
    out_dir -- optional output directory; defaults to the directory
        reported by general.parse_filename.

    Returns the path of the wrapped file.

    As before, a header with no sequence lines is not written, and
    trailing sequence that was never preceded by a header is dropped.
    '''
    suffix = '_wrap_h.fasta' if header_whitespace else '_wrap.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix

    fixed_fasta = general.open_write_file(file_with_wrapping)

    def write_record(header, dna):
        fixed_fasta.write(header + '\n')
        # break_on_hyphens=False stops textwrap from ending a line early
        # at a '-' gap character, which would make line lengths uneven.
        wrap = textwrap.fill(dna, 60, break_on_hyphens=False)
        fixed_fasta.write(wrap + '\n')

    try:
        infile = general.open_file(file)
        try:
            header = ''
            dna_parts = []
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    dna = ''.join(dna_parts)
                    if dna:
                        write_record(header, dna)
                    dna_parts = []
                    header = line
                    if header_whitespace:
                        header = re.sub(r'\s+', '_', header)
                else:
                    dna_parts.append(line)
            # Catch the last record
            dna = ''.join(dna_parts)
            if dna and header:
                write_record(header, dna)
        finally:
            infile.close()
    finally:
        # Handles are released even if reading or writing fails.
        fixed_fasta.close()
    return file_with_wrapping
def fix_wrap(file, header_whitespace=False, out_dir=None):
    """
    Wrap a FASTA file so that no sequence line has more than 60 bases.

    The sequence lines of each record are joined and re-wrapped at 60
    characters; each record is written as soon as the next header, or
    the end of the file, is reached.

    Arguments:
    file -- path of the FASTA file to wrap.
    header_whitespace -- when True, runs of whitespace in headers are
        replaced by '_' and the suffix is '_wrap_h.fasta'; otherwise
        the suffix is '_wrap.fasta'.
    out_dir -- optional output directory; defaults to the directory
        reported by general.parse_filename.

    Returns the path of the wrapped file.

    A header with no sequence lines is written on its own, and an empty
    input produces an empty output file.
    """
    suffix = "_wrap_h.fasta" if header_whitespace else "_wrap.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + "/" + out_basename + suffix

    fixed_fasta = general.open_write_file(file_with_wrapping)

    def write_record(header, dna):
        fixed_fasta.write(header + "\n")
        if dna:
            # break_on_hyphens=False stops textwrap from ending a line
            # early at a '-' gap character.
            wrap = textwrap.fill(dna, 60, break_on_hyphens=False)
            fixed_fasta.write(wrap + "\n")

    try:
        infile = general.open_file(file)
        try:
            header = ""
            dna_parts = []
            for line in infile:
                line = line.rstrip()
                if line.startswith(">"):
                    # The loop alone advances the file. Pulling the next
                    # line by hand here used to raise StopIteration on a
                    # trailing header and to swallow a header that
                    # directly followed another header.
                    if header or dna_parts:
                        write_record(header, "".join(dna_parts))
                    header = line
                    if header_whitespace:
                        # Gets rid of whitespace in the headers
                        header = re.sub(r"\s+", "_", header)
                    dna_parts = []
                else:
                    dna_parts.append(line)
            # Catch the last record; nothing is written for empty input.
            if header or dna_parts:
                write_record(header, "".join(dna_parts))
        finally:
            infile.close()
    finally:
        # Handles are released even if reading or writing fails.
        fixed_fasta.close()
    return file_with_wrapping
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func,
                 out_dir=None):
    '''
    Rewrite a FASTA file so every line ends with exactly one '\\n'.

    Each line is right-stripped (removing '\\n' or '\\r') and written back
    followed by '\\n', so the last line is terminated too. When
    'header_whitespace' is in qc_set_func, whitespace runs in header
    lines also become '_' and the suffix is '_ended_h.fasta' instead of
    '_ended.fasta'.

    Returns (new_file_path, qc_set_func, checked_qc_set_func) with
    'new_line' and 'header_whitespace' removed from the repairs and
    'new_line' removed from the checks.
    '''
    underscore_headers = 'header_whitespace' in qc_set_func
    # The suffix mirrors the QC steps applied to the file.
    ending = '_ended_h.fasta' if underscore_headers else '_ended.fasta'
    folder, stem, _ext = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        folder = out_dir  # user specified output directory
    file_with_new_line = folder + '/' + stem + ending

    if sys.version_info <= (3, 0):
        # Python 2: universal-newline mode so '\r' also ends a line.
        reader = open(fasta_file_name, 'rU')
    else:
        reader = general.open_file(fasta_file_name)
    writer = general.open_write_file(file_with_new_line)
    for raw_line in reader:
        text = raw_line.rstrip()
        if underscore_headers and text.startswith('>'):
            text = re.sub(r'\s+', '_', text)
        writer.write(text + '\n')
    writer.close()
    reader.close()

    # Both repairs are already reflected in the file written above.
    repaired = ('new_line', 'header_whitespace')
    remaining_repairs = qc_set_func.difference(repaired)
    remaining_checks = checked_qc_set_func.difference(('new_line',))
    return (file_with_new_line, remaining_repairs, remaining_checks)
def fix_fasta_file(file, out_dir=None):
    '''
    Rewrite a FASTA file through Biopython SeqIO to normalise its format.

    The file is parsed with SeqIO.parse and written back with SeqIO.write.
    This is intended to remove common formatting issues such as '\\r'
    characters and unwrapped sequences. Whitespace in each record's
    description is replaced with '_' and the result becomes the whole header.

    Arguments:
    file -- path of the FASTA file to clean.
    out_dir -- optional output directory (created if missing). When None
               the directory reported by general.parse_filename is used.

    Returns the path of the new file, which ends in '_clean.fa'.

    Raises AssertionError if check_header_pattern rejects the file.
    '''
    # Give up early if the file does not look like FASTA. The error is
    # raised explicitly rather than with `assert` so the check survives
    # `python -O`; the exception type seen by callers is unchanged.
    if not check_header_pattern(file):
        raise AssertionError(
            "Sorry, " + str(file) + " does not look like FASTA to me")
    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # os.makedirs replaces the `mkdir -p` shell call, so paths with
        # spaces or shell metacharacters are handled literally.
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        out_path = out_dir
    fixed_file = out_path + '/' + out_basename + suffix

    def _underscore_headers(records):
        # Iterate through the records to remove white-space from the
        # ID line.
        for record in records:
            header = re.sub(r'\s+', '_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            yield record

    out_handle = general.open_write_file(fixed_file)
    try:
        fasta_in = SeqIO.parse(file, 'fasta')
        written = SeqIO.write(_underscore_headers(fasta_in), out_handle,
                              'fasta')
    finally:
        # Close (and flush) the output before the path is handed back.
        out_handle.close()
    # The parenthesised form prints the same text on Python 2 and 3.
    print(str(written) + ' sequence records stored in ' + fixed_file)
    return (fixed_file)
def fix_headers(file, out_dir=None):
    """
    Copy a FASTA file, replacing each run of whitespace in a header by '_'.

    Per the original author's note, whitespace in headers breaks
    Trimmomatic and some other bioinformatics tools. Every line is
    right-stripped and written back terminated by '\\n'; only lines
    starting with '>' have their whitespace replaced.

    The copy is named '<basename>_h.fasta' and is placed in out_dir when
    given, otherwise in the directory reported by general.parse_filename.
    Returns the path of the copy.
    """
    src_dir, stem, _ext = general.parse_filename(file)
    dest_dir = src_dir if out_dir is None else out_dir
    file_with_header = dest_dir + "/" + stem + "_h.fasta"

    reader = general.open_file(file)
    writer = general.open_write_file(file_with_header)
    whitespace_run = re.compile(r"\s+")
    for raw_line in reader:
        text = raw_line.rstrip()
        if text.startswith(">"):  # header line
            text = whitespace_run.sub("_", text)
        writer.write(text + "\n")
    writer.close()
    reader.close()
    return file_with_header
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func,
                 out_dir=None):
    '''
    Normalise line endings in a FASTA file to a single '\\n' per line.

    Trailing whitespace (including '\\n' and '\\r') is stripped from each
    line and '\\n' is appended, so the last line is terminated too. When
    'header_whitespace' is in qc_set_func, whitespace runs in header
    lines also become '_' and the file is saved as
    '<basename>_ended_h.fasta' rather than '<basename>_ended.fasta'.

    Returns (new_file_path, qc_set_func, checked_qc_set_func) with
    'new_line' and 'header_whitespace' removed from the repairs and
    'new_line' removed from the checks.
    '''
    fix_headers_too = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if fix_headers_too:
        suffix = '_ended_h.fasta'  # suffix mirrors the QC steps taken
    directory, base_name, _extension = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        directory = out_dir  # user specified output directory
    file_with_new_line = '/'.join([directory, base_name + suffix])

    running_python3 = sys.version_info > (3, 0)
    if running_python3:
        source = general.open_file(fasta_file_name)
    else:
        # Python 2 needs universal-newline mode to split on '\r'.
        source = open(fasta_file_name, 'rU')
    sink = general.open_write_file(file_with_new_line)
    for entry in source:
        entry = entry.rstrip()
        if entry.startswith('>') and fix_headers_too:
            entry = re.sub(r'\s+', '_', entry)
        sink.write(entry + '\n')
    sink.close()
    source.close()

    # Both repairs are already reflected in the file written above.
    finished_repairs = set(['new_line', 'header_whitespace'])
    finished_checks = set(['new_line'])
    return (file_with_new_line,
            qc_set_func.difference(finished_repairs),
            checked_qc_set_func.difference(finished_checks))
def fix_headers(file, out_dir=None):
    '''
    Write a '<basename>_h.fasta' copy of a FASTA file whose headers
    contain no whitespace.

    Per the original author's note, whitespace in headers breaks
    Trimmomatic and some other bioinformatics tools. Each line is
    right-stripped and re-terminated with '\\n'; in lines beginning with
    '>' every run of whitespace becomes one '_'.

    The copy goes to out_dir when supplied, otherwise to the directory
    returned by general.parse_filename. Returns the path of the copy.
    '''
    folder, base_name, _extension = general.parse_filename(file)
    if out_dir is not None:
        folder = out_dir  # user specified output directory wins
    file_with_header = '/'.join([folder, base_name + '_h.fasta'])

    source = general.open_file(file)
    sink = general.open_write_file(file_with_header)
    for entry in source:
        entry = entry.rstrip()
        if entry.startswith('>'):
            entry = re.sub(r'\s+', '_', entry)
        sink.write(entry + '\n')
    sink.close()
    source.close()
    return file_with_header
def main():
    '''
    For a given FASTA file function runs all qc steps listed in the
    list of steps.
    USAGE: python fasta_o_matic.py [-h] [-v] [-q] [-c] -f FILE -s STEPS

    QC STEPS:
    unique - Checks if FASTA headers have unique first words or can be
    made unique automatically. May save altered file with suffix
    '_h.fasta'.
    new_line - Checks if the last line in a FASTA file ends in the
    standard new line character ('\\n') and will also fail if the
    sequence lines end in the less common '\\r' character. Reformatted
    files are saved with the '_ended.fasta' suffix.
    wrap - Checks if the sequence lines in a FASTA file exceed 80
    characters and if all the wrapped lines are the same length (this
    should be true if the FASTA file is wrapped). Wrapped file is saved
    with the suffix '_wrap.fasta'.
    header_whitespace - Remove white spaces from the headers of a FASTA
    file. Fixed FASTA file is saved with the suffix '_h.fasta'.
    '''
    # Flow: parse the command line, configure logging, run
    # test_reformatting() with log output suppressed, then hand the
    # user's file and step list to run_steps() and return its result.
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Script runs quality checking and filtering \
        based on a user-defined list of quality \
        checks. Command-line options that may be \
        omitted (i.e. are NOT required) are shown \
        in square brackets.\
        \
        QC STEPS:\
        \
        unique -- checks if FASTA headers have unique\
        first words or can be made unique \
        automatically. May save altered file with \
        suffix \'_h.fasta\'.\
        \
        new_line -- checks if the last line in a FASTA file ends in the standard new line \
        character (\'\\n\') and will also fail if \
        the sequence lines end in the less common \
        \'\\r\' character. Reformatted files are \
        saved with the \'_ended.fasta\' suffix.\
        \
        wrap -- checks if the sequence lines in a \
        FASTA file exceed 80 characters and if all \
        the wrapped lines are the same length \
        (this should be true if the FASTA file is \
        wrapped). Wrapped file is saved with the \
        suffix \'_wrap.fasta\'.\
        \
        header_whitespace -- remove white spaces \
        from the headers of a FASTA file. Fixed \
        FASTA file is saved with the suffix \
        \'_h.fasta\'.')
    # -v and -q share dest='verbose'; verbose defaults to True, so only
    # -q changes the value.
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c',
                        '--colorized',
                        help='Colorizes log reports. Use only if printing \
        output to screen.',
                        action='store_true',
                        dest='colorized')
    parser.add_argument(
        '-f',
        '--fasta',
        dest='fasta_file_name',
        help='This is the the full path (path and filename) of \
        the user provided FASTA file.',
        required=True)
    parser.add_argument('-s',
                        '--qc_steps',
                        nargs='+',
                        dest='steps',
                        help='List of QC steps to perform on FASTA file \
        options are wrap, new_line, header_whitespace, unique \
        (default= -s wrap new_line unique).',
                        default=['wrap', 'new_line', 'unique'],
                        required=False)
    parser.add_argument(
        '-o',
        '--out_dir',
        dest='out_dir',
        help=
        'Output directory for any repaired FASTA created (no trailing slash).',
        default=None,
        required=False)
    args = parser.parse_args()
    # Verbose runs log at DEBUG level and call doc() first; quiet runs
    # keep the logging default level.
    # NOTE(review): `log` is presumably the stdlib logging module
    # imported under that name -- confirm at the top of the file.
    if args.verbose:
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        doc()
        log.info(
            'Output is verbose. Run with -q, --quiet flag to suppress full output.'
        )
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    # Colorer is imported only for its import-time effect; the name is
    # not used again in this function.
    if args.colorized:
        import Colorer
    log.info('#######################################')
    log.info('# Unit testing...')
    log.info('#######################################')
    # Logging is disabled up to CRITICAL while test_reformatting() runs
    # and restored with log.NOTSET afterwards.
    log.disable(log.CRITICAL)
    (out_path, out_basename,
     out_ext) = general.parse_filename(args.fasta_file_name)
    if args.out_dir is not None:
        out_path = args.out_dir  # switch to user specified output directory
    test_reformatting(out_path)
    log.disable(log.NOTSET)
    log.info('#######################################')
    log.info('# Done unit testing.')
    log.info('#######################################')
    # Run reformatting
    final_fasta_file_name = run_steps(args.fasta_file_name, args.steps,
                                      args.out_dir)
    return (final_fasta_file_name)
def main():
    '''
    For a given FASTA file function runs all qc steps listed in the
    list of steps.
    USAGE: python fasta_o_matic.py [-h] [-v] [-q] [-c] -f FILE -s STEPS

    QC STEPS:
    unique - Checks if FASTA headers have unique first words or can be
    made unique automatically. May save altered file with suffix
    '_h.fasta'.
    new_line - Checks if the last line in a FASTA file ends in the
    standard new line character ('\\n') and will also fail if the
    sequence lines end in the less common '\\r' character. Reformatted
    files are saved with the '_ended.fasta' suffix.
    wrap - Checks if the sequence lines in a FASTA file exceed 80
    characters and if all the wrapped lines are the same length (this
    should be true if the FASTA file is wrapped). Wrapped file is saved
    with the suffix '_wrap.fasta'.
    header_whitespace - Remove white spaces from the headers of a FASTA
    file. Fixed FASTA file is saved with the suffix '_h.fasta'.
    '''
    # Flow: parse the command line, configure logging, run
    # test_reformatting() with log output suppressed, then hand the
    # user's file and step list to run_steps() and return its result.
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Script runs quality checking and filtering \
        based on a user-defined list of quality \
        checks. Command-line options that may be \
        omitted (i.e. are NOT required) are shown \
        in square brackets.\
        \
        QC STEPS:\
        \
        unique -- checks if FASTA headers have unique\
        first words or can be made unique \
        automatically. May save altered file with \
        suffix \'_h.fasta\'.\
        \
        new_line -- checks if the last line in a FASTA file ends in the standard new line \
        character (\'\\n\') and will also fail if \
        the sequence lines end in the less common \
        \'\\r\' character. Reformatted files are \
        saved with the \'_ended.fasta\' suffix.\
        \
        wrap -- checks if the sequence lines in a \
        FASTA file exceed 80 characters and if all \
        the wrapped lines are the same length \
        (this should be true if the FASTA file is \
        wrapped). Wrapped file is saved with the \
        suffix \'_wrap.fasta\'.\
        \
        header_whitespace -- remove white spaces \
        from the headers of a FASTA file. Fixed \
        FASTA file is saved with the suffix \
        \'_h.fasta\'.')
    # -v and -q share dest='verbose'; verbose defaults to True, so only
    # -q changes the value.
    parser.add_argument('-v', '--verbose', action='store_true', dest='verbose',
                        help='Runs reporting status updates', default=True)
    parser.add_argument('-q', '--quiet', action='store_false', dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing \
        output to screen.',action='store_true',dest='colorized')
    parser.add_argument('-f', '--fasta', dest='fasta_file_name',
                        help='This is the the full path (path and filename) of \
        the user provided FASTA file.', required=True)
    parser.add_argument('-s', '--qc_steps', nargs='+', dest='steps',
                        help='List of QC steps to perform on FASTA file \
        options are wrap, new_line, header_whitespace, unique \
        (default= -s wrap new_line unique).',
                        default=['wrap','new_line','unique'], required=False)
    parser.add_argument('-o', '--out_dir', dest='out_dir',
                        help='Output directory for any repaired FASTA created (no trailing slash).',
                        default=None,required=False)
    args = parser.parse_args()
    # Verbose runs log at DEBUG level and call doc() first; quiet runs
    # keep the logging default level.
    # NOTE(review): `log` is presumably the stdlib logging module
    # imported under that name -- confirm at the top of the file.
    if args.verbose:
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        doc()
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    # Colorer is imported only for its import-time effect; the name is
    # not used again in this function.
    if args.colorized:
        import Colorer
    log.info('#######################################')
    log.info('# Unit testing...')
    log.info('#######################################')
    # Logging is disabled up to CRITICAL while test_reformatting() runs
    # and restored with log.NOTSET afterwards.
    log.disable(log.CRITICAL)
    (out_path,out_basename,out_ext)=general.parse_filename(args.fasta_file_name)
    if args.out_dir is not None:
        out_path = args.out_dir # switch to user specified output directory
    test_reformatting(out_path)
    log.disable(log.NOTSET)
    log.info('#######################################')
    log.info('# Done unit testing.')
    log.info('#######################################')
    # Run reformatting
    final_fasta_file_name = run_steps(args.fasta_file_name, args.steps, args.out_dir)
    return(final_fasta_file_name)
def main():
    '''
    Run full script as opposed to individual script functions.
    '''
    # Flow: parse the command line, read the list of FASTQ files, check
    # that each can be opened, create the output directories, then write
    # one Trimmomatic shell script per forward file plus a qsub script
    # that submits them all. Nothing is returned.
    # NOTE(review): the parser description ("Summarize counts of all
    # four DNA bases") and the -d/--dna option do not match what the
    # code below does; args.sequence is never read here and
    # args.min_read_length is only printed.
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. \
        Command-line options that may be omitted \
        (i.e. are NOT required) are shown in \
        square brackets.')
    # -v and -q share dest='verbose'; verbose defaults to True, so only
    # -q changes the value.
    parser.add_argument('-v', '--verbose', action='store_true', dest='verbose',
                        help='Runs reporting status updates', default=True)
    parser.add_argument('-q', '--quiet', action='store_false', dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing \
        output to screen.',action='store_true',dest='colorized')
    parser.add_argument('-r', '--read_list', dest='read_list',
                        help='This is the the full path (path and filename) of \
        the user provided list of read files. The file should \
        be tab separated with the first read file, then the \
        second read file (see example_read_list_PE.tab). If a \
        sample has multiple fastq files for R1 and R2 separate \
        these with commas (see example_read_list_PE_multi.tab).\
        For single end reads each line should be a path \
        to a fastq file. For single end reads each line should \
        be a path to a fastq file (see example_read_list_SE.tab\
        )', required=True)
    parser.add_argument('-p', '--project', dest='project',
                        help='The project id. This will be used to name output \
        (default=project).', default='project', required=False)
    parser.add_argument('-a', '--adapter', dest='adapter',
                        help='The adapter fasta file. This will be used to \
        clean reads',default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
                        required=False)
    parser.add_argument('-s', '--single_end', action='store_true', dest='single',
                        help='If your reads are single end use this flag. \
        Without it the script assumes reads are paired end. \
        Also skip the second column (the reverse fastq files) \
        when making your read list', required=False, default=False)
    parser.add_argument('-x', '--convert_header', action='store_true',
                        dest='convert_header', help='If the illumina headers \
        do not end in /1 or /2 use this parameter to indicat \
        that headers need to be converted. Check your headers \
        by typing "head FASTA_FULL_PATH" and read more about \
        illumina headers at \
        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False, required=False)
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90)
    # NOTE(review): the default '~' is passed to general.parse_filename
    # unchanged; whether it is expanded to $HOME is not visible here.
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)', required=False,
                        default='~')
    parser.add_argument('-d', '--dna', dest='sequence', help='DNA sequence to \
        summarize', default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
                        required=False)
    args = parser.parse_args()
    # Colorer is imported only for its import-time effect.
    if args.colorized:
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############        Call custom functions with arguments   ###########
    ######################################################################
    #######################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards,reverses) = trimmomatic_template.parse_file(args.read_list, args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    # Each file is opened and closed straight away, then its list entry
    # is replaced by the result of general.convert_to_full.
    # NOTE(review): this presumably fails fast on unreadable files and
    # yields absolute paths -- confirm against the helpers in `general`.
    index = 0
    for fastq in forwards:
        f_opened_file=general.open_file(forwards[index])
        f_opened_file.close()
        forwards[index] = general.convert_to_full(forwards[index])
        if not args.single:
            r_opened_file=general.open_file(reverses[index])
            r_opened_file.close()
            reverses[index] = general.convert_to_full(reverses[index])
        index += 1
    #######################################
    # Make output directory
    #######################################
    (out_path,out_basename,out_ext)=general.parse_filename(args.out)
    out_dir=out_path + '/' + out_basename
    general.path_check(out_dir) # Sanity check directory
    out_dir= out_dir + '/' + args.project # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # `convert` is a shell pipeline fragment (awk program plus output
    # redirect) appended after 'cat <fastq>' when -x is given. It
    # rewrites every fourth line starting at line 1 (the FASTQ header),
    # prints '+' for every fourth line starting at line 3 and passes the
    # other lines through unchanged.
    convert=' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh')
    qsub_script.write('#!/bin/bash\n')
    index=0
    # The adapter FASTA is passed through fasta_o_matic.run_steps and
    # the path it returns is used from here on.
    args.adapter = fasta_o_matic.run_steps(args.adapter,['wrap', 'new_line','header_whitespace'])
    for fastq in forwards:
        (f_path,f_basename,f_ext)=general.parse_filename(forwards[index])
        # One qsub line per forward file; the job script itself is
        # written below under the same name.
        qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 '+ out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh\n' )
        if not args.single:
            (r_path,r_basename,r_ext)=general.parse_filename(reverses[index])
        # NOTE(review): block nesting was reconstructed from a flattened
        # source; trim_script is assumed to be opened for both single-
        # and paired-end runs because both branches below write to it.
        trim_script = general.open_write_file(out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh')
        trim_script.write('#!/bin/bash\n')
        # Convert headers
        if args.convert_header:
            trim_script.write('# Convert headers:\n')
            new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
            trim_script.write('cat ' + forwards[index] + convert + new_forward_fastq + '\n')
            # Later steps read the converted file, not the original.
            forwards[index] = new_forward_fastq
            if not args.single:
                new_reverse_fastq = out_dir + '/' + r_basename + '_h.fastq'
                trim_script.write('cat ' + reverses[index] + convert + new_reverse_fastq + '\n')
                reverses[index] = new_reverse_fastq
        # Trim sequences
        trim_script.write('# Clean reads:\n')
        if not args.single:
            trim_script.write(trimmomatic_template.trim_template( forwards[index], reverses[index], args.adapter, out_dir))
        else:
            trim_script.write(trimmomatic_template.trim_template_single(forwards[index])) # Section in progress... (Remember to point to a SE adapter fasta file
            # by default)
        trim_script.close()
        index += 1
    qsub_script.close()
def _build_arg_parser():
    '''
    Build the command-line parser used by main().

    Help and description texts are assembled from adjacent string
    literals instead of backslash continuations inside one literal, so
    the indentation of this source file never ends up inside the text
    that is shown to the user.
    '''
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. '
        'Command-line options that may be omitted '
        '(i.e. are NOT required) are shown in '
        'square brackets.')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q', '--quiet',
                        action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing '
                        'output to screen.',
                        action='store_true',
                        dest='colorized')
    parser.add_argument(
        '-r', '--read_list',
        dest='read_list',
        help='This is the the full path (path and filename) of '
        'the user provided list of read files. The file should '
        'be tab separated with the first read file, then the '
        'second read file (see example_read_list_PE.tab). If a '
        'sample has multiple fastq files for R1 and R2 separate '
        'these with commas (see example_read_list_PE_multi.tab). '
        'For single end reads each line should be a path '
        'to a fastq file. For single end reads each line should '
        'be a path to a fastq file (see example_read_list_SE.tab)',
        required=True)
    parser.add_argument(
        '-p', '--project',
        dest='project',
        help='The project id. This will be used to name output '
        '(default=project).',
        default='project',
        required=False)
    parser.add_argument(
        '-a', '--adapter',
        dest='adapter',
        help='The adapter fasta file. This will be used to '
        'clean reads',
        default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
        required=False)
    parser.add_argument(
        '-s', '--single_end',
        action='store_true',
        dest='single',
        help='If your reads are single end use this flag. '
        'Without it the script assumes reads are paired end. '
        'Also skip the second column (the reverse fastq files) '
        'when making your read list',
        required=False,
        default=False)
    parser.add_argument(
        '-x', '--convert_header',
        action='store_true',
        dest='convert_header',
        help='If the illumina headers '
        'do not end in /1 or /2 use this parameter to indicat '
        'that headers need to be converted. Check your headers '
        'by typing "head FASTA_FULL_PATH" and read more about '
        'illumina headers at '
        'http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
        default=False,
        required=False)
    # type=int keeps a user-supplied value the same type as the default.
    parser.add_argument('-m', '--min_read_length',
                        dest='min_read_length',
                        type=int,
                        help='The minimum read length in bp. (Default = 90).',
                        required=False,
                        default=90)
    # NOTE(review): the default is a literal '~'; whether it gets expanded
    # depends on general.parse_filename -- confirm.
    parser.add_argument('-o', '--out',
                        dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False,
                        default='~')
    # NOTE(review): main() never reads args.sequence; this option and the
    # parser description look like template leftovers -- confirm before
    # removing, since dropping the flag would change the CLI.
    parser.add_argument(
        '-d', '--dna',
        dest='sequence',
        help='DNA sequence to '
        'summarize',
        default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
        required=False)
    return parser


def main():
    '''
    Run full script as opposed to individual script functions.

    Steps, in order:
      1. Parse the command line and configure logging.
      2. Read the forward (and, unless --single_end, reverse) FASTQ paths
         from the read list and check that each one can be opened.
      3. Create <out>/<project> with 'scripts' and 'qsubs' sub-directories.
      4. Write one run_trimmomatic_<basename>.sh per forward FASTQ file
         and a qsub_trimmomatic.sh that submits each of them.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    args = _build_arg_parser().parse_args()

    if args.colorized:
        # The imported name is not used afterwards; presumably the module
        # takes effect on import -- confirm.
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress '
                 'full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')

    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    # Get list of read FASTQ files
    #######################################
    # NOTE(review): min_read_length is only echoed here; it is not passed
    # to either trimmomatic template below.
    print(args.read_list, args.single, args.min_read_length)
    (forwards, reverses) = trimmomatic_template.parse_file(args.read_list,
                                                           args.single)

    #######################################
    # Sanity check read FASTQ files
    #######################################
    # Opening and immediately closing each file is the readability check;
    # the list entries are then replaced by general.convert_to_full().
    for index, forward in enumerate(forwards):
        general.open_file(forward).close()
        forwards[index] = general.convert_to_full(forward)
        if not args.single:
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])

    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, _) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir)  # Sanity check directory
    out_dir = out_dir + '/' + args.project  # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')

    #######################################
    # Write trimmomatic script
    #######################################
    # Shell fragment appended after 'cat <fastq>': an awk filter that
    # rewrites every 4th-line header and reduces the '+' line to a bare
    # '+', redirected into the file name appended after it.
    convert = (' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); '
               'printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", '
               'arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], '
               'substr($2, 1, 1), $0} '
               'else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > ')

    qsub_script = general.open_write_file(out_dir +
                                          '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        args.adapter = fasta_o_matic.run_steps(
            args.adapter, ['wrap', 'new_line', 'header_whitespace'])

        for index, forward in enumerate(forwards):
            (_, f_basename, _) = general.parse_filename(forward)
            script_path = (out_dir + '/scripts/run_trimmomatic_' +
                           f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' +
                              script_path + '\n')
            if not args.single:
                (_, r_basename, _) = general.parse_filename(reverses[index])

            trim_script = general.open_write_file(script_path)
            try:
                trim_script.write('#!/bin/bash\n')

                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = (out_dir + '/' + f_basename +
                                         '_h.fastq')
                    trim_script.write('cat ' + forward + convert +
                                      new_forward_fastq + '\n')
                    # Trimming below must read the converted file.
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename +
                                             '_h.fastq')
                        trim_script.write('cat ' + reverses[index] + convert +
                                          new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq

                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(
                        trimmomatic_template.trim_template(
                            forwards[index], reverses[index], args.adapter,
                            out_dir))
                else:
                    # Section in progress... (Remember to point to a SE
                    # adapter fasta file by default)
                    trim_script.write(
                        trimmomatic_template.trim_template_single(
                            forwards[index]))
            finally:
                # Close the per-sample script even if a write or template
                # call raises.
                trim_script.close()
    finally:
        qsub_script.close()