def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func,
                out_dir=None):
    '''
    Replace every run of whitespace in the headers of a FASTA file with a
    single underscore. Sequence lines are copied with trailing whitespace
    stripped, and every output line ends in '\\n'. The fixed FASTA file is
    saved with the suffix '_h.fasta', next to the input file unless
    out_dir is given.

    Returns a tuple of (path of the fixed file, qc_set_func without
    'header_whitespace', checked_qc_set_func without 'header_whitespace').
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    header_pattern = re.compile('^>.*')
    # Raw string: a bare '\s' in a normal literal is an invalid escape
    whitespace_pattern = re.compile(r'\s+')
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if header_pattern.match(line):
                    line = whitespace_pattern.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close()
    finally:
        # Release the input handle even if reading or writing failed
        broken_fasta.close()
    # Remove the qc step because it is corrected in the final FASTA file
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(
        remove_set)  # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)
def fix_new_line(file, header_whitespace=False, out_dir=None):
    """
    Strips trailing whitespace, including any new line character ('\\n' or
    '\\r'), from each line in file and ends each line (including the last
    line) with a new line character ('\\n').

    When header_whitespace is True, runs of whitespace inside header lines
    are also replaced with '_' and the output suffix is '_ended_h.fasta';
    otherwise headers are left as they are and the suffix is
    '_ended.fasta'. The file is written next to the input unless out_dir
    is given. Returns the path of the new file.
    """
    suffix = "_ended.fasta"
    if header_whitespace:
        suffix = "_ended_h.fasta"  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + "/" + out_basename + suffix
    header_pattern = re.compile("^>.*")
    whitespace_pattern = re.compile(r"\s+")
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                # Only touch headers when that repair was requested, so the
                # output matches the suffix chosen above
                if header_whitespace and header_pattern.match(line):
                    line = whitespace_pattern.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    return file_with_new_line
def check_wrap(fasta_file_name):
    '''
    Check whether the sequence lines of a FASTA file look wrapped.

    Returns False as soon as a sequence line is longer than 80 characters
    or compare_lengths() rejects the line lengths of a record that has
    more than two sequence lines. Otherwise returns the result of the most
    recent compare_lengths() call. This function only reads the file; it
    does not write anything.

    NOTE(review): the result starts out False, so a file in which no
    record has more than two sequence lines is reported as not wrapped -
    confirm that this is intended.
    '''
    header_pattern = re.compile('^>.*')
    infile = general.open_file(fasta_file_name)
    lengths = []
    lengths_OK = False
    try:
        for line in infile:
            line = line.rstrip()
            if header_pattern.match(line):
                # A new record starts: evaluate the record just finished
                if len(lengths) > 2:  # If multiple lines remain to compare
                    # presumably checks that all but the last line share one
                    # length; compare_lengths is defined elsewhere
                    lengths_OK = compare_lengths(lengths)
                    if not lengths_OK:
                        return False
                lengths = []
            else:
                if len(line) > 80:  # exit on a sequence line > 80
                    return False
                lengths.append(len(line))
    finally:
        # Also runs on the early returns above, so the handle never leaks
        infile.close()
    # One last record to evaluate after falling off the end of the loop
    if len(lengths) > 2:
        lengths_OK = compare_lengths(lengths)
    return lengths_OK
def fix_headers(fasta_file_name, qc_set_func, checked_qc_set_func,
                out_dir=None):
    '''
    Replace every run of whitespace in the headers of a FASTA file with a
    single underscore. Sequence lines are copied with trailing whitespace
    stripped, and every output line ends in '\\n'. The fixed FASTA file is
    saved with the suffix '_h.fasta', next to the input file unless
    out_dir is given.

    Returns a tuple of (path of the fixed file, qc_set_func without
    'header_whitespace', checked_qc_set_func without 'header_whitespace').
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    header_pattern = re.compile('^>.*')
    # Raw string: a bare '\s' in a normal literal is an invalid escape
    whitespace_pattern = re.compile(r'\s+')
    broken_fasta = general.open_file(fasta_file_name)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if header_pattern.match(line):
                    line = whitespace_pattern.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close()
    finally:
        # Release the input handle even if reading or writing failed
        broken_fasta.close()
    # Remove the qc step because it is corrected in the final FASTA file
    remove_set = set(['header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_qc_set_func = checked_qc_set_func.difference(
        remove_set)  # skip finished checks
    return (file_with_header, qc_set_func, checked_qc_set_func)
def parse_file(read_list_file, single):
    '''
    Parse the input read_list file and return the FASTQ file lists.

    read_list_file -- path of the plain text read list.
    single -- True for single end reads. Each non-blank line is then a
              comma separated list of FASTQ files. Otherwise each
              non-blank line must hold exactly two whitespace separated
              columns: the comma separated forward files and the comma
              separated reverse files.

    Returns (forwards, False) for single end reads and
    (forwards, reverses) for paired end reads. When the file has several
    non-blank lines, the lists come from the last one. Blank lines are
    skipped and trailing whitespace (including the newline) is stripped.

    Logs an error and exits with a non-zero status if a paired end line
    does not have exactly two columns or if the numbers of forward and
    reverse files differ.
    '''
    forwards = []
    reverses = []
    read_list = general.open_file(read_list_file)
    try:
        if single:
            for line in read_list:
                # Strip the newline so the last file name stays usable
                line = line.rstrip()
                if line:
                    forwards = line.split(',')
            return (forwards, False)
        try:
            for line in read_list:
                line = line.rstrip()
                if not line:
                    continue
                # Raises ValueError unless there are exactly two columns
                (forward, reverse) = re.split(r'\s+', line)
                forwards = forward.split(',')
                reverses = reverse.split(',')
                if len(forwards) != len(reverses):
                    log.error('Exiting because the number of forward read FASTQ files does not equal the number of reverse read FASTQ files. This may indicate that your read list is not properly formatted. It could indicate that you should use the -s --single flag to for single end Illumina reads or make sure to separate you comma separated list of forward and reverse reads with a single tab in your plain text read_list file.')
                    sys.exit(1)
        except ValueError as e:
            log.error('"%(e)s... Use the -s --single flag to indicate single end Illumina reads or make sure to separate you comma separated list of foward and reverse reads with a single tab in your plain text read_list file."' % {'e': e})
            sys.exit(1)
        return (forwards, reverses)
    finally:
        # Runs on every return and on sys.exit, so the handle is released
        read_list.close()
def check_headers(fasta_file_name):
    '''
    Check if FASTA headers contain white spaces that break Trimmomatic and
    some other bioinfo tools. Trailing whitespace is ignored because each
    line is stripped before it is tested.

    Returns False as soon as a header containing whitespace is found.
    Returns True if no header contains whitespace.
    '''
    header_pattern = re.compile('^>.*')
    whitespace_pattern = re.compile(r'\s')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            line = line.rstrip()
            if header_pattern.match(line) and whitespace_pattern.search(line):
                return False
    finally:
        # Close the handle on the early return as well
        infile.close()
    return True
def check_header_pattern(file):
    '''
    Report whether the first line of a FASTA file is a header line.
    Returns True when the first line starts with '>' and False when the
    file starts with any other character (or is empty).
    '''
    infile = general.open_file(file)
    first_line = infile.readline()
    infile.close()
    # Only the first line is read; the rest of the file is not inspected
    return re.match('^>.*', first_line) is not None
def check_header_pattern(file):
    """
    Test whether a FASTA file opens with a header. The first line is read
    and the result is True if it begins with '>', otherwise False.
    """
    starts_with_header = re.compile("^>.*")
    handle = general.open_file(file)
    opening_line = handle.readline()
    handle.close()
    return bool(starts_with_header.match(opening_line))
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
    Wraps text in a FASTA file so that no line of sequence has more than
    60 bases. Headers are written as soon as they are read. When
    'header_whitespace' is in qc_set_func, runs of whitespace in headers
    are also replaced with '_'.

    The wrapped file is saved with the suffix '_wrap.fasta', or
    '_wrap_h.fasta' when headers were repaired too, next to the input file
    unless out_dir is given.

    Returns a tuple of (path of the wrapped file, qc_set_func without
    'wrap', 'new_line' and 'header_whitespace', checked_qc_set_func
    without 'wrap').
    '''
    wrap_width = 60
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap_h.fasta' if fix_whitespace else '_wrap.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    header_pattern = re.compile('^>.*')
    whitespace_pattern = re.compile(r'\s+')
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence read so far that is too short to wrap
            for line in infile:
                line = line.rstrip()
                if header_pattern.match(line):
                    if dna:
                        # print remaining sequence before the header
                        fixed_fasta.write(dna + '\n')
                        dna = ''
                    if fix_whitespace:
                        line = whitespace_pattern.sub('_', line)
                    fixed_fasta.write(line + '\n')
                else:
                    dna += line
                    # Write every complete 60 base chunk by offset; slicing
                    # the head off the string once per chunk is quadratic
                    # for a long sequence stored on a single line
                    complete = len(dna) - len(dna) % wrap_width
                    for start in range(0, complete, wrap_width):
                        fixed_fasta.write(
                            dna[start:start + wrap_width] + '\n')
                    dna = dna[complete:]
            if dna:  # Catch the end of the last record
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    # Remove all three qc steps because all are corrected in the final file
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
def check_headers(file):
    """
    Check if FASTA headers contain white spaces that break Trimmomatic and
    some other bioinfo tools. Trailing whitespace is ignored because each
    line is stripped before it is tested.

    Returns False as soon as a header containing whitespace is found.
    Returns True if no header contains whitespace.
    """
    header_pattern = re.compile("^>.*")
    whitespace_pattern = re.compile(r"\s")
    infile = general.open_file(file)
    try:
        for line in infile:
            line = line.rstrip()
            if header_pattern.match(line) and whitespace_pattern.search(line):
                return False
    finally:
        # Close the handle on the early return as well
        infile.close()
    return True
def check_header_pattern(fasta_file_name):
    '''
    Check that a FASTA file begins with a header.
    Returns True if the first line begins with a '>'. Returns False if the
    file starts with any other character.
    '''
    fasta = general.open_file(fasta_file_name)
    leading_line = fasta.readline()
    fasta.close()
    found = re.compile('^>.*').match(leading_line)
    return True if found else False
def check_headers(fasta_file_name):
    '''
    Check if FASTA headers contain white spaces that break Trimmomatic and
    some other bioinfo tools. Trailing whitespace is ignored because each
    line is stripped before it is tested.

    Returns False as soon as a header containing whitespace is found.
    Returns True if no header contains whitespace.
    '''
    header_pattern = re.compile('^>.*')
    whitespace_pattern = re.compile(r'\s')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            line = line.rstrip()
            if header_pattern.match(line) and whitespace_pattern.search(line):
                return False
    finally:
        # Close the handle on the early return as well
        infile.close()
    return True
def fix_wrap(fasta_file_name, qc_set_func, checked_qc_set_func, out_dir=None):
    '''
    Wraps text in a FASTA file so that no line of sequence has more than
    60 bases. Headers are written as soon as they are read. When
    'header_whitespace' is in qc_set_func, runs of whitespace in headers
    are also replaced with '_'.

    The wrapped file is saved with the suffix '_wrap.fasta', or
    '_wrap_h.fasta' when headers were repaired too, next to the input file
    unless out_dir is given.

    Returns a tuple of (path of the wrapped file, qc_set_func without
    'wrap', 'new_line' and 'header_whitespace', checked_qc_set_func
    without 'wrap').
    '''
    wrap_width = 60
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_wrap_h.fasta' if fix_whitespace else '_wrap.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    header_pattern = re.compile('^>.*')
    whitespace_pattern = re.compile(r'\s+')
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(fasta_file_name)
        try:
            dna = ''  # sequence read so far that is too short to wrap
            for line in infile:
                line = line.rstrip()
                if header_pattern.match(line):
                    if dna:
                        # print remaining sequence before the header
                        fixed_fasta.write(dna + '\n')
                        dna = ''
                    if fix_whitespace:
                        line = whitespace_pattern.sub('_', line)
                    fixed_fasta.write(line + '\n')
                else:
                    dna += line
                    # Write every complete 60 base chunk by offset; slicing
                    # the head off the string once per chunk is quadratic
                    # for a long sequence stored on a single line
                    complete = len(dna) - len(dna) % wrap_width
                    for start in range(0, complete, wrap_width):
                        fixed_fasta.write(
                            dna[start:start + wrap_width] + '\n')
                    dna = dna[complete:]
            if dna:  # Catch the end of the last record
                fixed_fasta.write(dna + '\n')
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    # Remove all three qc steps because all are corrected in the final file
    remove_set = set(['wrap', 'new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['wrap'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_wrapping, qc_set_func, checked_qc_set_func)
def check_new_line(file):
    """
    Report whether the final line of a FASTA file is terminated by the
    standard new line character ('\\n'). Returns True if it is and False
    otherwise, which includes a final line ending in the less common
    '\\r' character and an empty file.
    """
    handle = general.open_file(file)
    final_char = ""
    for current_line in handle:
        # Overwritten on every pass, so the last line decides the result
        final_char = current_line[-1]
    handle.close()
    return final_char == "\n"
def check_iupac(fasta_file_name):
    '''
    Check if FASTA file contains non-IUPAC characters in sequence lines.
    Header lines (starting with '>') are not checked.

    Returns False, after logging the offending character, as soon as a
    character outside the accepted set is found. Returns True if none are
    found. The accepted set is the upper and lower case letters a-y
    without j, plus '-' and '*'.
    '''
    iupac_set = set(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
                     'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
                     'x', 'y', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                     'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
                     'V', 'W', 'X', 'Y', '-', '*'])
    header_pattern = re.compile('^>.*')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            if header_pattern.match(line):
                continue
            for char in line.rstrip():
                if char not in iupac_set:
                    log.error('\tError: %(char)s in sequence line'
                              % {'char': char})
                    return False
    finally:
        # Close the handle on the early return as well
        infile.close()
    return True
def fix_wrap(file, header_whitespace=False, out_dir=None):
    '''
    Wraps text in a FASTA file so that no line of sequence has more than
    60 bases. When header_whitespace is True, runs of whitespace in
    headers are also replaced with '_'.

    The wrapped file is saved with the suffix '_wrap.fasta', or
    '_wrap_h.fasta' when header_whitespace is True, next to the input file
    unless out_dir is given. Records that have no sequence are not written.
    Returns the path of the wrapped file.

    Sequences are cut every 60 characters rather than passed through
    textwrap, which treats '-' and whitespace as break points and would
    wrap gapped sequences unevenly.
    '''
    wrap_width = 60
    suffix = '_wrap.fasta'
    if header_whitespace:
        suffix = '_wrap_h.fasta'
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + '/' + out_basename + suffix
    header_pattern = re.compile('^>.*')
    whitespace_pattern = re.compile(r'\s+')
    fixed_fasta = general.open_write_file(file_with_wrapping)
    try:
        infile = general.open_file(file)
        try:
            header = ''
            dna_parts = []
            records = []
            for line in infile:
                line = line.rstrip()
                if header_pattern.match(line):
                    dna = ''.join(dna_parts)
                    if dna:
                        records.append((header, dna))
                    dna_parts = []
                    header = line
                    if header_whitespace:
                        header = whitespace_pattern.sub('_', header)
                else:
                    dna_parts.append(line)
            # Catch the last record
            dna = ''.join(dna_parts)
            if dna and header:
                records.append((header, dna))
        finally:
            infile.close()
        for header, dna in records:
            fixed_fasta.write(header + '\n')
            # Wrap sequence lines after 60 bases
            for start in range(0, len(dna), wrap_width):
                fixed_fasta.write(dna[start:start + wrap_width] + '\n')
    finally:
        fixed_fasta.close()
    return file_with_wrapping
def fix_wrap(file, header_whitespace=False, out_dir=None):
    """
    Wraps text in a FASTA file so that no line of sequence has more than
    60 bases. When header_whitespace is True, runs of whitespace in
    headers are also replaced with '_'.

    The wrapped file is saved with the suffix '_wrap.fasta', or
    '_wrap_h.fasta' when header_whitespace is True, next to the input file
    unless out_dir is given. Returns the path of the wrapped file.

    Every header is written when it is read, followed by its sequence cut
    every 60 characters. A header with no sequence (including a header on
    the last line of the file) is written on its own, and an empty input
    produces an empty output.
    """
    wrap_width = 60
    suffix = "_wrap.fasta"
    if header_whitespace:
        suffix = "_wrap_h.fasta"
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_wrapping = out_path + "/" + out_basename + suffix
    header_pattern = re.compile("^>.*")
    whitespace_pattern = re.compile(r"\s+")
    fixed_fasta = general.open_write_file(file_with_wrapping)

    def write_sequence(parts):
        # Join the collected sequence lines and emit them 60 bases at a
        # time; writes nothing when no sequence was collected
        dna = "".join(parts)
        for start in range(0, len(dna), wrap_width):
            fixed_fasta.write(dna[start:start + wrap_width] + "\n")

    try:
        infile = general.open_file(file)
        try:
            dna_parts = []
            for line in infile:
                line = line.rstrip()
                if header_pattern.match(line):
                    # Finish the previous record before starting a new one
                    write_sequence(dna_parts)
                    dna_parts = []
                    if header_whitespace:
                        # Gets rid of whitespace in the headers
                        line = whitespace_pattern.sub("_", line)
                    fixed_fasta.write(line + "\n")
                else:
                    dna_parts.append(line)
            write_sequence(dna_parts)  # For end of file
        finally:
            infile.close()
    finally:
        fixed_fasta.close()
    return file_with_wrapping
def check_wrap(file):
    """
    Check whether the sequence lines of a FASTA file look wrapped.

    Returns False as soon as a sequence line is longer than 80 characters,
    or when a record with more than two sequence lines has a line (other
    than its last) whose length differs from the wrapping length. The
    wrapping length is the length of the first line of the first record
    that has more than two sequence lines, and it is reused for every
    later record. Returns True otherwise. This function only reads the
    file; it does not write anything.
    """
    header_pattern = re.compile("^>.*")
    infile = general.open_file(file)
    lengths = []
    state = {"wrap_length": None}

    def record_is_wrapped(record_lengths):
        # Records with one or two lines have nothing to compare
        if len(record_lengths) <= 2:
            return True
        if state["wrap_length"] is None:
            # initialize wrapping length
            state["wrap_length"] = record_lengths[0]
        # The last sequence line of a record may be shorter, so skip it
        for seq_line in record_lengths[:-1]:
            if seq_line != state["wrap_length"]:
                return False
        return True

    try:
        for line in infile:
            line = line.rstrip()
            if header_pattern.match(line):
                # A new record starts: evaluate the record just finished
                if not record_is_wrapped(lengths):
                    return False
                lengths = []
            else:
                if len(line) > 80:  # exit on a sequence line > 80
                    return False
                lengths.append(len(line))
    finally:
        # Also runs on the early returns above, so the handle never leaks
        infile.close()
    # For end of file: evaluate the last record
    return record_is_wrapped(lengths)
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func,
                 out_dir=None):
    '''
    Strips trailing whitespace, including any new line character ('\\n' or
    '\\r'), from each line in file and ends each line (including the last
    line) with a new line character ('\\n'). When 'header_whitespace' is
    in qc_set_func, runs of whitespace in headers are also replaced with
    '_'.

    The fixed file is saved with the suffix '_ended.fasta', or
    '_ended_h.fasta' when headers were repaired too, next to the input
    file unless out_dir is given.

    Returns a tuple of (path of the fixed file, qc_set_func without
    'new_line' and 'header_whitespace', checked_qc_set_func without
    'new_line').
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if fix_whitespace:
        suffix = '_ended_h.fasta'  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    header_pattern = re.compile('^>.*')
    whitespace_pattern = re.compile(r'\s+')
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2 needs the universal newlines flag to split on '\r'
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if fix_whitespace and header_pattern.match(line):
                    line = whitespace_pattern.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they are corrected in the final file
    remove_set = set(['new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)
def get_count(fasta_file_name):
    '''
    Count the lines and the sequences in a FASTA file.

    Returns a tuple of (number of lines that end in a '\\n' character,
    number of header lines, i.e. lines that start with '>').
    '''
    header_pattern = re.compile('^>.*')
    newline_pattern = re.compile('.*\n')
    if sys.version_info > (3, 0):
        # Keep the original line endings: newline='' stops python3.3+
        # from converting them to standard Unix newlines
        fasta_file = open(fasta_file_name, 'r', newline='')
    else:
        fasta_file = general.open_file(fasta_file_name)
    total_lines = 0
    total_headers = 0
    for raw_line in fasta_file:
        if newline_pattern.match(raw_line):
            total_lines += 1
        if header_pattern.match(raw_line.rstrip()):
            total_headers += 1
    fasta_file.close()
    return (total_lines, total_headers)
def check_unique(fasta_file_name):
    '''
    Check if FASTA headers have unique first words. The first word is the
    run of non-whitespace characters directly after the '>'.

    Returns True if all header first words are unique. Returns False as
    soon as a first word is seen a second time, or a header has no first
    word (a blank header cannot pass a test for uniqueness).
    '''
    first_word_set = set()
    header_pattern = re.compile('^>.*')
    first_word_pattern = re.compile(r'^>(\S+)')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            line = line.rstrip()
            if not header_pattern.match(line):
                continue
            word = first_word_pattern.match(line)
            if word is None:
                return False  # Blank header
            current_word = word.group(1)
            if current_word in first_word_set:
                return False  # you have seen this first word before!
            first_word_set.add(current_word)
    finally:
        # Close the handle on the early returns as well
        infile.close()
    return True
def check_iupac(fasta_file_name):
    '''
    Check if FASTA file contains non-IUPAC characters in sequence lines.
    Header lines (starting with '>') are not checked.

    Returns False, after logging the offending character, as soon as a
    character outside the accepted set is found. Returns True if none are
    found. The accepted set is the upper and lower case letters a-y
    without j, plus '-' and '*'.
    '''
    iupac_set = set([
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'A', 'B',
        'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
        'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', '-', '*'
    ])
    header_pattern = re.compile('^>.*')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            if header_pattern.match(line):
                continue
            for char in line.rstrip():
                if char not in iupac_set:
                    log.error('\tError: %(char)s in sequence line'
                              % {'char': char})
                    return False
    finally:
        # Close the handle on the early return as well
        infile.close()
    return True
def fix_headers(file, out_dir=None):
    """
    Remove white spaces that break Trimmomatic and some other bioinfo
    tools from the headers of a FASTA file by replacing every run of
    whitespace in a header with a single underscore. Sequence lines are
    copied with trailing whitespace stripped, and every output line ends
    in '\\n'.

    Fixed FASTA file is saved with the suffix '_h.fasta', next to the
    input file unless out_dir is given. Returns the path of the fixed
    file.
    """
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + "/" + out_basename + "_h.fasta"
    header_pattern = re.compile("^>.*")
    # Raw string: a bare '\s' in a normal literal is an invalid escape
    whitespace_pattern = re.compile(r"\s+")
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if header_pattern.match(line):
                    line = whitespace_pattern.sub("_", line)
                fixed_fasta.write(line + "\n")
        finally:
            fixed_fasta.close()
    finally:
        # Release the input handle even if reading or writing failed
        broken_fasta.close()
    return file_with_header
def check_unique(fasta_file_name):
    '''
    Check if FASTA headers have unique first words. The first word is the
    run of non-whitespace characters directly after the '>'.

    Returns True if all header first words are unique. Returns False as
    soon as a first word is seen a second time, or a header has no first
    word (a blank header cannot pass a test for uniqueness).
    '''
    first_word_set = set()
    header_pattern = re.compile('^>.*')
    first_word_pattern = re.compile(r'^>(\S+)')
    infile = general.open_file(fasta_file_name)
    try:
        for line in infile:
            line = line.rstrip()
            if not header_pattern.match(line):
                continue
            word = first_word_pattern.match(line)
            if word is None:
                return False  # Blank header
            current_word = word.group(1)
            if current_word in first_word_set:
                return False  # you have seen this first word before!
            first_word_set.add(current_word)
    finally:
        # Close the handle on the early returns as well
        infile.close()
    return True
def fix_new_line(fasta_file_name, qc_set_func, checked_qc_set_func,
                 out_dir=None):
    '''
    Strips trailing whitespace, including any new line character ('\\n' or
    '\\r'), from each line in file and ends each line (including the last
    line) with a new line character ('\\n'). When 'header_whitespace' is
    in qc_set_func, runs of whitespace in headers are also replaced with
    '_'.

    The fixed file is saved with the suffix '_ended.fasta', or
    '_ended_h.fasta' when headers were repaired too, next to the input
    file unless out_dir is given.

    Returns a tuple of (path of the fixed file, qc_set_func without
    'new_line' and 'header_whitespace', checked_qc_set_func without
    'new_line').
    '''
    fix_whitespace = 'header_whitespace' in qc_set_func
    suffix = '_ended.fasta'
    if fix_whitespace:
        suffix = '_ended_h.fasta'  # make suffix match QC steps taken
    (out_path, out_basename, out_ext) = general.parse_filename(fasta_file_name)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_new_line = out_path + '/' + out_basename + suffix
    header_pattern = re.compile('^>.*')
    whitespace_pattern = re.compile(r'\s+')
    if sys.version_info > (3, 0):
        broken_fasta = general.open_file(fasta_file_name)
    else:
        # Python 2 needs the universal newlines flag to split on '\r'
        broken_fasta = open(fasta_file_name, 'rU')
    try:
        fixed_fasta = general.open_write_file(file_with_new_line)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if fix_whitespace and header_pattern.match(line):
                    line = whitespace_pattern.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close()
    finally:
        broken_fasta.close()
    # Remove both qc steps because they are corrected in the final file
    remove_set = set(['new_line', 'header_whitespace'])
    qc_set_func = qc_set_func.difference(remove_set)  # skip finished repairs
    checked_remove_set = set(['new_line'])
    checked_qc_set_func = checked_qc_set_func.difference(
        checked_remove_set)  # skip finished checks
    return (file_with_new_line, qc_set_func, checked_qc_set_func)
def fix_headers(file, out_dir=None):
    '''
    Remove white spaces that break Trimmomatic and some other bioinfo
    tools from the headers of a FASTA file by replacing every run of
    whitespace in a header with a single underscore. Sequence lines are
    copied with trailing whitespace stripped, and every output line ends
    in '\\n'.

    Fixed FASTA file is saved with the suffix '_h.fasta', next to the
    input file unless out_dir is given. Returns the path of the fixed
    file.
    '''
    (out_path, out_basename, out_ext) = general.parse_filename(file)
    if out_dir is not None:
        out_path = out_dir  # switch to user specified output directory
    file_with_header = out_path + '/' + out_basename + '_h.fasta'
    header_pattern = re.compile('^>.*')
    # Raw string: a bare '\s' in a normal literal is an invalid escape
    whitespace_pattern = re.compile(r'\s+')
    broken_fasta = general.open_file(file)
    try:
        fixed_fasta = general.open_write_file(file_with_header)
        try:
            for line in broken_fasta:
                line = line.rstrip()
                if header_pattern.match(line):
                    line = whitespace_pattern.sub('_', line)
                fixed_fasta.write(line + '\n')
        finally:
            fixed_fasta.close()
    finally:
        # Release the input handle even if reading or writing failed
        broken_fasta.close()
    return file_with_header
def main():
    '''
    Run full script as opposed to individual script functions.

    Steps, in order:
    1. Parse the command line arguments and configure logging.
    2. Read the forward (and, unless -s is given, reverse) FASTQ file
       lists from the read list with trimmomatic_template.parse_file().
    3. Check that every listed FASTQ file can be opened and replace each
       entry with the result of general.convert_to_full().
    4. Create <out>/<project>, <out>/<project>/scripts and
       <out>/<project>/qsubs.
    5. Pass the adapter file through fasta_o_matic.run_steps() and write
       qsubs/qsub_trimmomatic.sh plus one scripts/run_trimmomatic_*.sh
       per forward FASTQ file.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    # NOTE(review): the description below talks about counting DNA bases,
    # which does not match what this function does - looks copied from
    # another script; confirm before changing the user-facing text.
    parser = argparse.ArgumentParser(
                                     description='DESCRIPTION: Summarize counts of all four DNA bases. \
                                     Command-line options that may be omitted \
                                     (i.e. are NOT required) are shown in \
                                     square brackets.')
    # -v and -q share dest='verbose'; verbose is the default
    parser.add_argument('-v', '--verbose', action='store_true',
                        dest='verbose', help='Runs reporting status updates',
                        default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                        dest='verbose', help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing \
                        output to screen.',action='store_true',dest='colorized')
    parser.add_argument('-r', '--read_list', dest='read_list',
                        help='This is the the full path (path and filename) of \
                        the user provided list of read files. The file should \
                        be tab separated with the first read file, then the \
                        second read file (see example_read_list_PE.tab). If a \
                        sample has multiple fastq files for R1 and R2 separate \
                        these with commas (see example_read_list_PE_multi.tab).\
                        For single end reads each line should be a path \
                        to a fastq file. For single end reads each line should \
                        be a path to a fastq file (see example_read_list_SE.tab\
                        )', required=True)
    parser.add_argument('-p', '--project', dest='project',
                        help='The project id. This will be used to name output \
                        (default=project).', default='project', required=False)
    parser.add_argument('-a', '--adapter', dest='adapter',
                        help='The adapter fasta file. This will be used to \
                        clean reads',default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa', required=False)
    parser.add_argument('-s', '--single_end', action='store_true', dest='single',
                        help='If your reads are single end use this flag. \
                        Without it the script assumes reads are paired end. \
                        Also skip the second column (the reverse fastq files) \
                        when making your read list', required=False, default=False)
    parser.add_argument('-x', '--convert_header', action='store_true',
                        dest='convert_header', help='If the illumina headers \
                        do not end in /1 or /2 use this parameter to indicat \
                        that headers need to be converted. Check your headers \
                        by typing "head FASTA_FULL_PATH" and read more about \
                        illumina headers at \
                        http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
                        default=False, required=False)
    # No type= is given, so a value passed on the command line is a str
    # while the default is an int; it is only printed in this function
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90)
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False, default='~')
    # args.sequence is not referenced anywhere else in this function
    parser.add_argument('-d', '--dna', dest='sequence', help='DNA sequence to \
                        summarize', default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT', required=False)
    args = parser.parse_args()
    if args.colorized:
        # Imported for its side effect only; the name is not used below
        import Colorer
    if args.verbose:
        doc()  # defined elsewhere; presumably prints the module docs
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.')
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')
    ######################################################################
    ############   Call custom functions with arguments        ###########
    ######################################################################
    #######################################
    # Get list of read FASTQ files
    #######################################
    print(args.read_list, args.single, args.min_read_length)
    (forwards,reverses) = trimmomatic_template.parse_file(args.read_list, args.single)
    #######################################
    # Sanity check read FASTQ files
    #######################################
    # Each file is opened and closed straight away, only to confirm that
    # it can be opened; index walks forwards and reverses in lockstep
    index = 0
    for fastq in forwards:
        f_opened_file=general.open_file(forwards[index])
        f_opened_file.close()
        forwards[index] = general.convert_to_full(forwards[index])
        if not args.single:
            r_opened_file=general.open_file(reverses[index])
            r_opened_file.close()
            reverses[index] = general.convert_to_full(reverses[index])
        index += 1
    #######################################
    # Make output directory
    #######################################
    (out_path,out_basename,out_ext)=general.parse_filename(args.out)
    out_dir=out_path + '/' + out_basename
    general.path_check(out_dir) # Sanity check directory
    out_dir= out_dir + '/' + args.project # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')
    #######################################
    # Write trimmomatic script
    #######################################
    # Shell fragment appended after 'cat <fastq>': an awk program that
    # rewrites every fourth line starting with the first (the read header),
    # prints '+' for the third line of each group of four, and copies the
    # other lines; its output is redirected to the file name that follows
    convert=' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '
    qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh')
    qsub_script.write('#!/bin/bash\n')
    index=0
    # The adapter path is replaced by whatever run_steps() returns for the
    # three listed QC steps
    args.adapter = fasta_o_matic.run_steps(args.adapter,['wrap', 'new_line','header_whitespace'])
    for fastq in forwards:
        (f_path,f_basename,f_ext)=general.parse_filename(forwards[index])
        # One qsub line per forward FASTQ file, pointing at its own script
        qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 '+ out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh\n' )
        if not args.single:
            (r_path,r_basename,r_ext)=general.parse_filename(reverses[index])
        trim_script = general.open_write_file(out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh')
        trim_script.write('#!/bin/bash\n')
        # Convert headers
        if args.convert_header:
            trim_script.write('# Convert headers:\n')
            new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq'
            trim_script.write('cat ' + forwards[index] + convert + new_forward_fastq + '\n')
            # From here on the converted copy is the file that gets trimmed
            forwards[index] = new_forward_fastq
            if not args.single:
                new_reverse_fastq = out_dir + '/' + r_basename + '_h.fastq'
                trim_script.write('cat ' + reverses[index] + convert + new_reverse_fastq + '\n')
                reverses[index] = new_reverse_fastq
        # Trim sequences
        trim_script.write('# Clean reads:\n')
        if not args.single:
            trim_script.write(trimmomatic_template.trim_template( forwards[index], reverses[index], args.adapter, out_dir))
        else:
            trim_script.write(trimmomatic_template.trim_template_single(forwards[index]))
            # Section in progress... (Remember to point to a SE adapter fasta file
            # by default)
        trim_script.close()
        index += 1
    qsub_script.close()
def _build_trimmomatic_parser():
    '''
    Build the argparse parser holding every command-line option of main().

    Help texts are written as adjacent string literals; argparse's default
    formatter collapses runs of whitespace when rendering, so the layout of
    the literals here does not affect the displayed help.
    '''
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Write Trimmomatic read cleaning scripts, '
                    'and a qsub script that submits them, for the FASTQ '
                    'files named in a read list. '
                    'Command-line options that may be omitted '
                    '(i.e. are NOT required) are shown in '
                    'square brackets.')
    # Verbose is the default; -q/--quiet flips the same destination to False.
    parser.add_argument('-v', '--verbose', action='store_true',
                        dest='verbose',
                        help='Runs reporting status updates', default=True)
    parser.add_argument('-q', '--quiet', action='store_false',
                        dest='verbose',
                        help='Does not report status updates')
    parser.add_argument('-c', '--colorized',
                        help='Colorizes log reports. Use only if printing '
                             'output to screen.',
                        action='store_true', dest='colorized')
    parser.add_argument(
        '-r', '--read_list', dest='read_list',
        help='This is the the full path (path and filename) of '
             'the user provided list of read files. The file should '
             'be tab separated with the first read file, then the '
             'second read file (see example_read_list_PE.tab). If a '
             'sample has multiple fastq files for R1 and R2 separate '
             'these with commas (see example_read_list_PE_multi.tab). '
             'For single end reads each line should be a path '
             'to a fastq file. For single end reads each line should '
             'be a path to a fastq file (see example_read_list_SE.tab '
             ')',
        required=True)
    parser.add_argument(
        '-p', '--project', dest='project',
        help='The project id. This will be used to name output '
             '(default=project).',
        default='project', required=False)
    parser.add_argument(
        '-a', '--adapter', dest='adapter',
        help='The adapter fasta file. This will be used to '
             'clean reads',
        default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa',
        required=False)
    parser.add_argument(
        '-s', '--single_end', action='store_true', dest='single',
        help='If your reads are single end use this flag. '
             'Without it the script assumes reads are paired end. '
             'Also skip the second column (the reverse fastq files) '
             'when making your read list',
        required=False, default=False)
    parser.add_argument(
        '-x', '--convert_header', action='store_true',
        dest='convert_header',
        help='If the illumina headers '
             'do not end in /1 or /2 use this parameter to indicat '
             'that headers need to be converted. Check your headers '
             'by typing "head FASTA_FULL_PATH" and read more about '
             'illumina headers at '
             'http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.',
        default=False, required=False)
    # type=int keeps a user supplied value the same type as the default.
    parser.add_argument('-m', '--min_read_length', dest='min_read_length',
                        type=int,
                        help='The minimum read length in bp. (Default = 90).',
                        required=False, default=90)
    parser.add_argument('-o', '--out', dest='out',
                        help='Output directory (Default=$HOME)',
                        required=False, default='~')
    # Kept so existing command lines that pass -d keep parsing; main() does
    # not read args.sequence.
    parser.add_argument(
        '-d', '--dna', dest='sequence',
        help='DNA sequence to '
             'summarize',
        default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT',
        required=False)
    return parser


def main():
    '''
    Run full script as opposed to individual script functions.

    Steps, in order:
      1. Parse the command line and configure logging.
      2. Read the list of FASTQ files and check that each one can be opened.
      3. Create <out>/<project>, plus its 'scripts' and 'qsubs'
         sub-directories.
      4. Write one run_trimmomatic_<forward basename>.sh per forward read
         file, and a qsub_trimmomatic.sh holding one qsub line per script.

    Nothing is returned; the results are the files written to disk.
    '''
    ######################################################################
    ############        Get commandline arguments             ############
    ######################################################################
    args = _build_trimmomatic_parser().parse_args()

    if args.colorized:
        # Imported only for its import-time effect; the name is not used
        # anywhere else in this function.
        import Colorer
    if args.verbose:
        doc()  # defined elsewhere in this file
        log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG)
        log.info(
            'Output is verbose. Run with -q, --quiet flag to suppress full output.'
        )
    else:
        log.basicConfig(format='%(levelname)s: %(message)s')

    ######################################################################
    ############      Call custom functions with arguments     ###########
    ######################################################################
    #######################################
    # Get list of read FASTQ files
    #######################################
    # NOTE(review): min_read_length is echoed here but is not passed to any
    # of the template calls below -- confirm whether that is intended.
    print(args.read_list, args.single, args.min_read_length)
    (forwards, reverses) = trimmomatic_template.parse_file(args.read_list,
                                                           args.single)

    #######################################
    # Sanity check read FASTQ files
    #######################################
    # Each file is opened and closed straight away purely as an access
    # check, then its entry is replaced with convert_to_full()'s result.
    for index, forward in enumerate(forwards):
        general.open_file(forward).close()
        forwards[index] = general.convert_to_full(forward)
        if not args.single:
            general.open_file(reverses[index]).close()
            reverses[index] = general.convert_to_full(reverses[index])

    #######################################
    # Make output directory
    #######################################
    (out_path, out_basename, out_ext) = general.parse_filename(args.out)
    out_dir = out_path + '/' + out_basename
    general.path_check(out_dir)  # Sanity check directory
    out_dir = out_dir + '/' + args.project  # final out directory is 'project_id'
    general.mk_out_sub_directory(out_dir)
    general.mk_out_sub_directory(out_dir + '/scripts')
    general.mk_out_sub_directory(out_dir + '/qsubs')

    #######################################
    # Write trimmomatic script
    #######################################
    # Shell fragment appended after 'cat <fastq>': awk rewrites every
    # fourth line starting at line 1 (the read header), prints a bare '+'
    # for every third line of a record and passes the other lines through,
    # redirecting into the file name that is appended after it.
    convert = ' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > '

    qsub_script = general.open_write_file(out_dir +
                                          '/qsubs/qsub_trimmomatic.sh')
    # try/finally so the handles are closed even when a later step raises.
    try:
        qsub_script.write('#!/bin/bash\n')
        # From here on args.adapter is whatever run_steps() returns for the
        # listed steps.
        args.adapter = fasta_o_matic.run_steps(
            args.adapter, ['wrap', 'new_line', 'header_whitespace'])

        for index, forward in enumerate(forwards):
            (f_path, f_basename, f_ext) = general.parse_filename(forward)
            script_name = (out_dir + '/scripts/run_trimmomatic_' +
                           f_basename + '.sh')
            qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' +
                              script_name + '\n')
            if not args.single:
                (r_path, r_basename, r_ext) = general.parse_filename(
                    reverses[index])

            trim_script = general.open_write_file(script_name)
            try:
                trim_script.write('#!/bin/bash\n')

                # Convert headers
                if args.convert_header:
                    trim_script.write('# Convert headers:\n')
                    new_forward_fastq = (out_dir + '/' + f_basename +
                                         '_h.fastq')
                    trim_script.write('cat ' + forward + convert +
                                      new_forward_fastq + '\n')
                    # Trimming below must read the converted file.
                    forwards[index] = new_forward_fastq
                    if not args.single:
                        new_reverse_fastq = (out_dir + '/' + r_basename +
                                             '_h.fastq')
                        trim_script.write('cat ' + reverses[index] +
                                          convert + new_reverse_fastq + '\n')
                        reverses[index] = new_reverse_fastq

                # Trim sequences
                trim_script.write('# Clean reads:\n')
                if not args.single:
                    trim_script.write(
                        trimmomatic_template.trim_template(
                            forwards[index], reverses[index], args.adapter,
                            out_dir))
                else:
                    # Section in progress... (Remember to point to a SE
                    # adapter fasta file by default)
                    trim_script.write(
                        trimmomatic_template.trim_template_single(
                            forwards[index]))
            finally:
                trim_script.close()
    finally:
        qsub_script.close()