def fix_fasta_file(file, out_dir=None):
    '''
    Passes a file through biopython SeqIO to remove common formatting
    issues like '\\r' characters and unwrapped sequences.

    Every run of white space in a record description is replaced with '_'
    and the result is used as the record id and name (the description is
    then emptied). The new file is saved with the suffix '_clean.fa',
    next to the input file or in out_dir when out_dir is given; out_dir is
    created if it does not exist.

    Returns the path of the cleaned file.
    Raises AssertionError if check_header_pattern() rejects the file.
    '''
    # Give up early if the file does not look like fasta. The error is
    # raised explicitly so the check is not stripped when running with -O.
    if not check_header_pattern(file):
        raise AssertionError(
            "Sorry, " + str(file) + " does not look like FASTA to me")
    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # Create the directory without a shell so that paths containing
        # spaces or shell metacharacters cannot be misinterpreted. '~' and
        # environment variables are expanded here because the previous
        # "mkdir -p" shell command expanded them too.
        new_dir = os.path.expandvars(os.path.expanduser(out_dir))
        try:
            os.makedirs(new_dir)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(new_dir):
                raise
        out_path = out_dir
    fixed_file = out_path + '/' + out_basename + suffix
    out_handle = general.open_write_file(fixed_file)
    try:
        # Iterate through the records to remove white-space
        # from the ID line
        new_records = []
        for record in SeqIO.parse(file, 'fasta'):
            header = re.sub(r'\s+', '_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            new_records.append(record)
        written = SeqIO.write(new_records, out_handle, 'fasta')
    finally:
        # Close explicitly so the records are flushed to disk before the
        # path is handed back to the caller.
        out_handle.close()
    print(str(written) + ' sequence records stored in ' + fixed_file)
    return fixed_file
def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func,
                out_dir=None):
    '''
    Remove white spaces from the headers of a FASTA file.

    Every line is stripped of trailing white space and written back ending
    in '\\n'; in lines starting with '>' each remaining run of white space
    is replaced with '_'. The fixed FASTA file is saved with the suffix
    '_h.fasta', next to the input or in out_dir when out_dir is given.

    Returns a tuple (path of the fixed file, qc_set_func without
    'header_whitespace', checked_qc_set_func without 'header_whitespace').
    The sets passed in are not modified.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = re.sub(r'\s+', '_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close even if reading or writing fails part way through.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove the qc step because it is corrected in the final FASTA file
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(
        remove_set)  # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)
def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func,
                out_dir=None):
    '''
    Remove white spaces from the headers of a FASTA file.

    Every line is stripped of trailing white space and written back ending
    in '\\n'; in lines starting with '>' each remaining run of white space
    is replaced with '_'. The fixed FASTA file is saved with the suffix
    '_h.fasta', next to the input or in out_dir when out_dir is given.

    Returns a tuple (path of the fixed file, qc_set_func without
    'header_whitespace', checked_qc_set_func without 'header_whitespace').
    The sets passed in are not modified.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = re.sub(r'\s+', '_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close even if reading or writing fails part way through.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove the qc step because it is corrected in the final FASTA file
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(
        remove_set)  # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)
def fix_new_line(file, header_whitespace=False, out_dir=None):
    """
    Strips trailing white space (including any '\\n' or '\\r') from each
    line in file and ends each line (including the last line) with a new
    line character ('\\n').

    When header_whitespace is true, every remaining run of white space in
    lines starting with '>' is also replaced with '_' and the output suffix
    is '_ended_h.fasta'; otherwise headers keep their inner white space and
    the suffix is '_ended.fasta', so the file name always matches the
    repairs actually applied. The file is written next to the input, or in
    out_dir when out_dir is given.

    Returns the path of the fixed file.
    """
    suffix = "_ended.fasta"
    if header_whitespace:
        suffix = "_ended_h.fasta"  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + "/" + out_basename + suffix
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                # Only touch headers when that repair was requested.
                if header_whitespace and line.startswith(">"):
                    line = re.sub(r"\s+", "_", line)
                fixed_fasta.write(line + "\n")
        finally:
            # Close even if reading or writing fails part way through.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_new_line
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func,
             out_dir=None):
    '''
    Wraps text in a FASTA file so that no line of sequence has more than 60
    bases.

    Trailing white space is stripped from every line and every output line
    ends in '\\n'. If 'header_whitespace' is in qc_set_func, each run of
    white space in the header lines is replaced with '_' and the wrapped
    file is saved with the suffix '_wrap_h.fasta'; otherwise the suffix is
    '_wrap.fasta'. The file is written next to the input, or in out_dir
    when out_dir is given.

    Returns a tuple (path of the wrapped file, qc_set_func without 'wrap',
    'new_line' and 'header_whitespace', checked_qc_set_func without
    'wrap'). The sets passed in are not modified.
    '''
    width = 60  # bases per sequence line
    underscore_headers = 'header_whitespace' in qc_set_func
    suffix = '_wrap.fasta'
    if underscore_headers:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence read but not yet written (shorter than width)
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    if underscore_headers:
                        line = re.sub(r'\s+', '_', line)
                    if dna:
                        # print remaining sequence before header
                        fixed_fasta.write(dna + '\n')
                    # Print headers immediately to new file
                    fixed_fasta.write(line + '\n')
                    dna = ''  # Reset DNA
                else:
                    # Dump sequence as soon as it is long enough to wrap.
                    # Walking the string by offset keeps this linear even
                    # when a whole unwrapped sequence sits on one line.
                    dna = dna + line
                    full = len(dna) - len(dna) % width
                    for start in range(0, full, width):
                        fixed_fasta.write(dna[start:start + width] + '\n')
                    dna = dna[full:]
            # Catch the last record
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        # Close even if reading or writing fails part way through.
        fixed_fasta.close()
    # Remove all three qc steps because all will be corrected in the final
    # FASTA file
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func,
             out_dir=None):
    '''
    Wraps text in a FASTA file so that no line of sequence has more than 60
    bases.

    Trailing white space is stripped from every line and every output line
    ends in '\\n'. If 'header_whitespace' is in qc_set_func, each run of
    white space in the header lines is replaced with '_' and the wrapped
    file is saved with the suffix '_wrap_h.fasta'; otherwise the suffix is
    '_wrap.fasta'. The file is written next to the input, or in out_dir
    when out_dir is given.

    Returns a tuple (path of the wrapped file, qc_set_func without 'wrap',
    'new_line' and 'header_whitespace', checked_qc_set_func without
    'wrap'). The sets passed in are not modified.
    '''
    width = 60  # bases per sequence line
    underscore_headers = 'header_whitespace' in qc_set_func
    suffix = '_wrap.fasta'
    if underscore_headers:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence read but not yet written (shorter than width)
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    if underscore_headers:
                        line = re.sub(r'\s+', '_', line)
                    if dna:
                        # print remaining sequence before header
                        fixed_fasta.write(dna + '\n')
                    # Print headers immediately to new file
                    fixed_fasta.write(line + '\n')
                    dna = ''  # Reset DNA
                else:
                    # Dump sequence as soon as it is long enough to wrap.
                    # Walking the string by offset keeps this linear even
                    # when a whole unwrapped sequence sits on one line.
                    dna = dna + line
                    full = len(dna) - len(dna) % width
                    for start in range(0, full, width):
                        fixed_fasta.write(dna[start:start + width] + '\n')
                    dna = dna[full:]
            # Catch the last record
            if dna:
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        # Close even if reading or writing fails part way through.
        fixed_fasta.close()
    # Remove all three qc steps because all will be corrected in the final
    # FASTA file
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
def fix_wrap(file, header_whitespace=False, out_dir=None):
    '''
    Wraps text in a FASTA file so that no line of sequence has more than 60
    bases.

    The sequence lines of each record are joined and re-wrapped at 60
    characters. When header_whitespace is true, each run of white space in
    the header lines is replaced with '_' and the wrapped file is saved with
    the suffix '_wrap_h.fasta'; otherwise the suffix is '_wrap.fasta'. The
    file is written next to the input, or in out_dir when out_dir is given.

    A header that is not followed by any sequence is not written to the
    output, and sequence at the end of the file is only written when a
    header has been seen.

    Returns the path of the wrapped file.
    '''
    suffix = '_wrap.fasta'
    if header_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    fixed_fasta = general.open_write_file(file_with_wrapping)

    def write_record(record_header, record_dna):
        # Write one header line followed by its sequence wrapped at 60.
        fixed_fasta.write(record_header + '\n')
        # break_on_hyphens=False keeps every full line exactly 60 characters
        # long even when the sequence contains '-' gap characters.
        wrap = textwrap.fill(record_dna, 60, break_on_hyphens=False)
        fixed_fasta.write(wrap + '\n')

    try:
        infile = general.open_file(file)
        try:
            header = ''
            pieces = []  # sequence lines of the record being read
            for line in infile:
                line = line.rstrip()
                if line.startswith('>'):
                    # A new header ends the previous record; write it now
                    # so only one record is held in memory at a time.
                    dna = ''.join(pieces)
                    if dna:
                        write_record(header, dna)
                    pieces = []
                    header = line
                    if header_whitespace:
                        header = re.sub(r'\s+', '_', header)
                else:
                    pieces.append(line)
            # Catch the last record
            dna = ''.join(pieces)
            if dna and header:
                write_record(header, dna)
        finally:
            infile.close()
    finally:
        # Close even if reading or writing fails part way through.
        fixed_fasta.close()
    return file_with_wrapping
def fix_wrap(file, header_whitespace=False, out_dir=None):
    """
    Wraps text in a FASTA file so that no line of sequence has more than 60
    bases.

    The sequence lines of each record are joined and re-wrapped at 60
    characters. When header_whitespace is true, each run of white space in
    the header lines is replaced with '_' and the wrapped file is saved with
    the suffix '_wrap_h.fasta'; otherwise the suffix is '_wrap.fasta'. The
    file is written next to the input, or in out_dir when out_dir is given.

    A header without any sequence (including a header on the last line of
    the file) is written on its own, and an empty input produces an empty
    output file.

    Returns the path of the wrapped file.
    """
    suffix = "_wrap.fasta"
    if header_whitespace:
        suffix = "_wrap_h.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + "/" + out_basename + suffix
    fixed_fasta = general.open_write_file(file_with_wrapping)

    def write_record(record_header, record_dna):
        # Write whichever parts of the record exist; nothing is written for
        # the empty state that precedes the first header.
        if record_header:
            fixed_fasta.write(record_header + "\n")
        if record_dna:
            # Wrap sequence lines after 60 bases. break_on_hyphens=False
            # keeps full lines exactly 60 characters long even when the
            # sequence contains '-' gap characters.
            wrap = textwrap.fill(record_dna, 60, break_on_hyphens=False)
            fixed_fasta.write(wrap + "\n")

    try:
        infile = general.open_file(file)
        try:
            header = ""
            pieces = []  # sequence lines of the record being read
            for line in infile:
                line = line.rstrip()
                if line.startswith(">"):
                    write_record(header, "".join(pieces))
                    header = line
                    if header_whitespace:
                        # Gets rid of whitespace in the headers
                        header = re.sub(r"\s+", "_", header)
                    # Sequence lines are collected by the else branch; the
                    # iterator is never advanced by hand, so a header at the
                    # end of the file or two headers in a row are safe.
                    pieces = []
                else:
                    pieces.append(line)
            # For end of file
            write_record(header, "".join(pieces))
        finally:
            infile.close()
    finally:
        # Close even if reading or writing fails part way through.
        fixed_fasta.close()
    return file_with_wrapping
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func,
                 out_dir=None):
    '''
    Strips trailing white space (including any '\\n' or '\\r') from each
    line in file and ends each line (including the last line) with a new
    line character ('\\n').

    If 'header_whitespace' is in qc_set_func, each remaining run of white
    space in lines starting with '>' is also replaced with '_' and the
    output suffix is '_ended_h.fasta'; otherwise the suffix is
    '_ended.fasta'. The file is written next to the input, or in out_dir
    when out_dir is given.

    Returns a tuple (path of the fixed file, qc_set_func without 'new_line'
    and 'header_whitespace', checked_qc_set_func without 'new_line'). The
    sets passed in are not modified.
    '''
    underscore_headers = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if underscore_headers:
        suffix = '_ended_h.fasta'  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2: universal-newline mode so that '\r' and '\r\n' line
        # endings are recognised as line breaks.
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if underscore_headers and line.startswith('>'):
                    line = re.sub(r'\s+', '_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close even if reading or writing fails part way through.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they will be corrected in the final
    # FASTA file
    remove_set = set(['new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)
def fix_fasta_file(file, out_dir=None):
    '''
    Passes a file through biopython SeqIO to remove common formatting
    issues like '\\r' characters and unwrapped sequences.

    Every run of white space in a record description is replaced with '_'
    and the result is used as the record id and name (the description is
    then emptied). The new file is saved with the suffix '_clean.fa',
    next to the input file or in out_dir when out_dir is given; out_dir is
    created if it does not exist.

    Returns the path of the cleaned file.
    Raises AssertionError if check_header_pattern() rejects the file.
    '''
    # Give up early if the file does not look like fasta. The error is
    # raised explicitly so the check is not stripped when running with -O.
    if not check_header_pattern(file):
        raise AssertionError(
            "Sorry, " + str(file) + " does not look like FASTA to me")
    suffix = '_clean.fa'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        # Create the directory without a shell so that paths containing
        # spaces or shell metacharacters cannot be misinterpreted. '~' and
        # environment variables are expanded here because the previous
        # "mkdir -p" shell command expanded them too.
        new_dir = os.path.expandvars(os.path.expanduser(out_dir))
        try:
            os.makedirs(new_dir)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(new_dir):
                raise
        out_path = out_dir
    fixed_file = out_path + '/' + out_basename + suffix
    out_handle = general.open_write_file(fixed_file)
    try:
        # Iterate through the records to remove white-space
        # from the ID line
        new_records = []
        for record in SeqIO.parse(file, 'fasta'):
            header = re.sub(r'\s+', '_', record.description)
            record.id = header
            record.name = header
            record.description = ''
            new_records.append(record)
        written = SeqIO.write(new_records, out_handle, 'fasta')
    finally:
        # Close explicitly so the records are flushed to disk before the
        # path is handed back to the caller.
        out_handle.close()
    print(str(written) + ' sequence records stored in ' + fixed_file)
    return fixed_file
def fix_headers(file, out_dir=None):
    """
    Remove white spaces that break Trimmomatic and some other bioinfo tools
    from the headers of a FASTA file.

    Every line is stripped of trailing white space and written back ending
    in '\\n'; in lines starting with '>' each remaining run of white space
    is replaced with '_'. The fixed FASTA file is saved with the suffix
    '_h.fasta', next to the input or in out_dir when out_dir is given.

    Returns the path of the fixed file.
    """
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + "/" + out_basename + "_h.fasta"
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith(">"):
                    line = re.sub(r"\s+", "_", line)
                fixed_fasta.write(line + "\n")
        finally:
            # Close even if reading or writing fails part way through.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_header
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func,
                 out_dir=None):
    '''
    Strips trailing white space (including any '\\n' or '\\r') from each
    line in file and ends each line (including the last line) with a new
    line character ('\\n').

    If 'header_whitespace' is in qc_set_func, each remaining run of white
    space in lines starting with '>' is also replaced with '_' and the
    output suffix is '_ended_h.fasta'; otherwise the suffix is
    '_ended.fasta'. The file is written next to the input, or in out_dir
    when out_dir is given.

    Returns a tuple (path of the fixed file, qc_set_func without 'new_line'
    and 'header_whitespace', checked_qc_set_func without 'new_line'). The
    sets passed in are not modified.
    '''
    underscore_headers = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if underscore_headers:
        suffix = '_ended_h.fasta'  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(
        fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2: universal-newline mode so that '\r' and '\r\n' line
        # endings are recognised as line breaks.
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if underscore_headers and line.startswith('>'):
                    line = re.sub(r'\s+', '_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close even if reading or writing fails part way through.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they will be corrected in the final
    # FASTA file
    remove_set = set(['new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)
def fix_headers(file, out_dir=None):
    '''
    Remove white spaces that break Trimmomatic and some other bioinfo tools
    from the headers of a FASTA file.

    Every line is stripped of trailing white space and written back ending
    in '\\n'; in lines starting with '>' each remaining run of white space
    is replaced with '_'. The fixed FASTA file is saved with the suffix
    '_h.fasta', next to the input or in out_dir when out_dir is given.

    Returns the path of the fixed file.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if line.startswith('>'):
                    line = re.sub(r'\s+', '_', line)
                fixed_fasta.write(line + '\n')
        finally:
            # Close even if reading or writing fails part way through.
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_header
def main():
    '''
    Run full script as opposed to individual script functions.

    Steps, in order:
      1. Parse the command line.
      2. Read the forward (and, unless --single_end, reverse) FASTQ paths
         from the read list, check each can be opened and convert each
         with general.convert_to_full().
      3. Create <out>/<project> with 'scripts' and 'qsubs' sub-directories.
      4. Pass the adapter FASTA through fasta_o_matic.run_steps().
      5. Write one 'run_trimmomatic_<forward basename>.sh' per forward read
         file, plus 'qsubs/qsub_trimmomatic.sh' with one qsub line per
         script.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # Help texts are built from adjacent string literals; argparse collapses
    # runs of white space when it renders them.
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. '
                    'Command-line options that may be omitted '
                    '(i.e. are NOT required) are shown in '
                    'square brackets.')
    parser.add_argument('-v', '--verbose', action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates', default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing '
                             'output to screen.',
                        action='store_true', dest='colorized')
    parser.add_argument('-r', '--read_list', dest='read_list',
                        help='This is the the full path (path and filename) of '
                             'the user provided list of read files. The file should '
                             'be tab separated with the first read file, then the '
                             'second read file (see example_read_list_PE.tab). If a '
                             'sample has multiple fastq files for R1 and R2 separate '
                             'these with commas (see example_read_list_PE_multi.tab). '
                             'For single end reads each line should be a path '
                             'to a fastq file. For single end reads each line should '
                             'be a path to a fastq file (see example_read_list_SE.tab '
                             ')',
                        required=True)
    parser.add_argument('-p', '--project', dest='project',
                        help='The project id. This will be used to name output '
                             '(default=project).',
                        default='project', required=False)
    parser.add_argument('-a', '--adapter', dest='adapter',
                        help='The adapter fasta file. This will be used to '
                             'clean reads',
                        default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
                        required=False)
    parser.add_argument('-s', '--single_end', action='store_true',
                        dest='single',
                        help='If your reads are single end use this flag. '
                             'Without it the script assumes reads are paired end. '
                             'Also skip the second column (the reverse fastq files) '
                             'when making your read list',
                        required=False, default=False)
    parser.add_argument('-x', '--convert_header', action='store_true',
                        dest='convert_header',
                        help='If the illumina headers '
                             'do not end in /1 or /2 use this parameter to indicat '
                             'that headers need to be converted. Check your headers '
                             'by typing "head FASTA_FULL_PATH" and read more about '
                             'illumina headers at '
                             'http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False, required=False)
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90)
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False, default='~')
    # NOTE(review): args.sequence is never read in this function; the option
    # is kept so existing command lines that pass -d keep working.
    parser.add_argument('-d', '--dna', dest='sequence',
                        help='DNA sequence to '
                             'summarize',
                        default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
                        required=False)
    args = parser.parse_args()
    if args.colorized:
        # Imported only for its import-time effect; the name itself is not
        # used here. Presumably it colorizes logging output - verify.
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    #######################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards, reverses) = trimmomatic_template.parse_file(args.read_list,
                                                           args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    for index, forward in enumerate(forwards):
        # Opening and closing each file proves it is readable before any
        # output is produced.
        general.open_file(forward).close()
        forwards[index] = general.convert_to_full(forward)
        if not args.single:
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, out_ext) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir)  # Sanity check directory
    out_dir = out_dir + '/' + args.project  # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell pipeline fragment appended after 'cat <fastq>'; it rewrites the
    # first line of every four-line FASTQ record and redirects into the
    # file name that is appended after it.
    convert = ' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        args.adapter = fasta_o_matic.run_steps(
            args.adapter, ['wrap', 'new_line', 'header_whitespace'])
        for index, forward in enumerate(forwards):
            (f_path, f_basename, f_ext) = general.parse_filename(forward)
            script_path = (out_dir + '/scripts/run_trimmomatic_' +
                           f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' +
                              script_path + '\n')
            if not args.single:
                (r_path, r_basename, r_ext) = general.parse_filename(
                    reverses[index])
            trim_script = general.open_write_file(script_path)
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
                    trim_script.write('cat ' + forwards[index] + convert +
                                      new_forward_fastq + '\n')
                    # Trimming must read the converted file from here on.
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename +
                                             '_h.fastq')
                        trim_script.write('cat ' + reverses[index] + convert +
                                          new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(trimmomatic_template.trim_template(
                        forwards[index], reverses[index], args.adapter,
                        out_dir))
                else:
                    # Section in progress... (Remember to point to a SE
                    # adapter fasta file by default)
                    trim_script.write(
                        trimmomatic_template.trim_template_single(
                            forwards[index]))
            finally:
                # Close each script even if writing it fails.
                trim_script.close()
    finally:
        qsub_script.close()
def main():
    '''
    Run full script as opposed to individual script functions.

    Steps, in order:
      1. Parse the command line.
      2. Read the forward (and, unless --single_end, reverse) FASTQ paths
         from the read list, check each can be opened and convert each
         with general.convert_to_full().
      3. Create <out>/<project> with 'scripts' and 'qsubs' sub-directories.
      4. Pass the adapter FASTA through fasta_o_matic.run_steps().
      5. Write one 'run_trimmomatic_<forward basename>.sh' per forward read
         file, plus 'qsubs/qsub_trimmomatic.sh' with one qsub line per
         script.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # Help texts are built from adjacent string literals; argparse collapses
    # runs of white space when it renders them.
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Summarize counts of all four DNA bases. '
                    'Command-line options that may be omitted '
                    '(i.e. are NOT required) are shown in '
                    'square brackets.')
    parser.add_argument('-v', '--verbose', action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates', default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing '
                             'output to screen.',
                        action='store_true', dest='colorized')
    parser.add_argument('-r', '--read_list', dest='read_list',
                        help='This is the the full path (path and filename) of '
                             'the user provided list of read files. The file should '
                             'be tab separated with the first read file, then the '
                             'second read file (see example_read_list_PE.tab). If a '
                             'sample has multiple fastq files for R1 and R2 separate '
                             'these with commas (see example_read_list_PE_multi.tab). '
                             'For single end reads each line should be a path '
                             'to a fastq file. For single end reads each line should '
                             'be a path to a fastq file (see example_read_list_SE.tab '
                             ')',
                        required=True)
    parser.add_argument('-p', '--project', dest='project',
                        help='The project id. This will be used to name output '
                             '(default=project).',
                        default='project', required=False)
    parser.add_argument('-a', '--adapter', dest='adapter',
                        help='The adapter fasta file. This will be used to '
                             'clean reads',
                        default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
                        required=False)
    parser.add_argument('-s', '--single_end', action='store_true',
                        dest='single',
                        help='If your reads are single end use this flag. '
                             'Without it the script assumes reads are paired end. '
                             'Also skip the second column (the reverse fastq files) '
                             'when making your read list',
                        required=False, default=False)
    parser.add_argument('-x', '--convert_header', action='store_true',
                        dest='convert_header',
                        help='If the illumina headers '
                             'do not end in /1 or /2 use this parameter to indicat '
                             'that headers need to be converted. Check your headers '
                             'by typing "head FASTA_FULL_PATH" and read more about '
                             'illumina headers at '
                             'http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False, required=False)
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90)
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False, default='~')
    # NOTE(review): args.sequence is never read in this function; the option
    # is kept so existing command lines that pass -d keep working.
    parser.add_argument('-d', '--dna', dest='sequence',
                        help='DNA sequence to '
                             'summarize',
                        default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
                        required=False)
    args = parser.parse_args()
    if args.colorized:
        # Imported only for its import-time effect; the name itself is not
        # used here. Presumably it colorizes logging output - verify.
        import Colorer
    if args.verbose:
        doc()
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    #######################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards, reverses) = trimmomatic_template.parse_file(args.read_list,
                                                           args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    for index, forward in enumerate(forwards):
        # Opening and closing each file proves it is readable before any
        # output is produced.
        general.open_file(forward).close()
        forwards[index] = general.convert_to_full(forward)
        if not args.single:
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])
    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, out_ext) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir)  # Sanity check directory
    out_dir = out_dir + '/' + args.project  # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell pipeline fragment appended after 'cat <fastq>'; it rewrites the
    # first line of every four-line FASTQ record and redirects into the
    # file name that is appended after it.
    convert = ' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh')
    try:
        qsub_script.write('#!/bin/bash\n')
        args.adapter = fasta_o_matic.run_steps(
            args.adapter, ['wrap', 'new_line', 'header_whitespace'])
        for index, forward in enumerate(forwards):
            (f_path, f_basename, f_ext) = general.parse_filename(forward)
            script_path = (out_dir + '/scripts/run_trimmomatic_' +
                           f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' +
                              script_path + '\n')
            if not args.single:
                (r_path, r_basename, r_ext) = general.parse_filename(
                    reverses[index])
            trim_script = general.open_write_file(script_path)
            try:
                trim_script.write('#!/bin/bash\n')
                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
                    trim_script.write('cat ' + forwards[index] + convert +
                                      new_forward_fastq + '\n')
                    # Trimming must read the converted file from here on.
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename +
                                             '_h.fastq')
                        trim_script.write('cat ' + reverses[index] + convert +
                                          new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq
                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(trimmomatic_template.trim_template(
                        forwards[index], reverses[index], args.adapter,
                        out_dir))
                else:
                    # Section in progress... (Remember to point to a SE
                    # adapter fasta file by default)
                    trim_script.write(
                        trimmomatic_template.trim_template_single(
                            forwards[index]))
            finally:
                # Close each script even if writing it fails.
                trim_script.close()
    finally:
        qsub_script.close()