def main(name_space):
    import argparse
    import textwrap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='correct_fasta')
    parser.add_argument("-i", "--input", help="FASTA file that needs to be fixed")
    parser.add_argument("-o", "--output", help="Name of the output FASTA file")
    parser.add_argument('-head', default='>', help='Specify a head symbol if not ">"')
    args = parser.parse_args(name_space)

    file_origin = args.input
    if args.output:
        file_corrected = args.output
    else:
        file_corrected = 'corrected_' + file_origin
    head = args.head

    # Re-join records whose sequences were wrapped over multiple lines, then
    # write the corrected records to a new file.
    fasta_corrected = File_IO.read_fasta_multiline(File_IO.read_file(file_origin), head_symbol=head)
    count = File_IO.write_seqs(fasta_corrected, file_corrected, checker=False, overwrite=True)
    print('Checked %d sequences in %s and saved in %s.' % (count, file_origin, file_corrected))
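# Usage sketch (not part of the original source): main() receives the
# remaining argv as a list, which appears to be how fast.py dispatches its
# subcommands. The file names below are hypothetical.
if __name__ == '__main__':
    main(['-i', 'raw.fasta', '-o', 'corrected.fasta'])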
def main(name_space):
    import argparse
    import textwrap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -filter_database')
    parser.add_argument("-i", "--input", help="Name of the input FASTA file.")
    parser.add_argument("-o", "--output", help="Name of the output FASTA file")
    args = parser.parse_args(name_space)

    print("Reading in %s ..." % args.input)
    database = File_IO.read_seqs(args.input)
    count = len(database)
    print("%s contains %i records." % (args.input, count))

    count_filter = 0
    database_cleaned = []
    for record in database:
        if record[0].find('unidentified') == -1:  # Keep the record if its label has no 'unidentified' taxonomic level.
            database_cleaned.append(record)
            count_filter += 1
    print("%i records contain the 'unidentified' string." % (count - count_filter))
    count_write = File_IO.write_seqs(database_cleaned, args.output)
    print("Filtered database saved in %s with %i records." % (args.output, count_write))
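# Usage sketch (hypothetical file names): drop reference records whose label
# carries an 'unidentified' taxonomic level, e.g. a header ending in
# 's__unidentified'.
if __name__ == '__main__':
    main(['-i', 'database.fasta', '-o', 'database.filtered.fasta'])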
def main(Namespace):
    import argparse
    import os
    import sys
    import time
    import textwrap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -merge_seqs')
    parser.add_argument('-i', '--input', help='Name of the input folder containing files to be merged')
    parser.add_argument('-o', '--output', help='Name of the merged file')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-fasta', action='store_true', help='Set the file type to FASTA')
    group.add_argument('-fastq', action='store_true', help='Set the file type to FASTQ, this is the default option')
    args = parser.parse_args(Namespace)

    input_folder = args.input
    if not input_folder:
        print('Please specify an input folder.')
        sys.exit()
    output_file = args.output
    if not output_file:
        print('Please specify an output file.')
        sys.exit()
    if os.path.isfile(output_file):
        file_size = round(os.path.getsize(output_file) / 1024 ** 2, 0)
        exist = input('%s (%d MB) already exists, do you want to overwrite it? [y/n]' % (output_file, file_size))
        if exist == 'y' or exist == 'Y':
            os.remove(output_file)
        else:
            print('Program stopped.')
            sys.exit()

    file_type = 'fastq'
    if args.fasta:
        file_type = 'fasta'

    start = time.time()
    f_list = File_IO.file_list(input_folder)
    f_list.sort()
    print('Found %i files in the folder %s' % (len(f_list), input_folder))

    n = 1
    count_total = 0
    for seq_file in f_list:
        current_file = input_folder + '/' + seq_file
        count = File_IO.write_seqs(File_IO.read_seqs(current_file, file_type), output_file,
                                   checker=False, overwrite=False)
        print('%d. Merged %d sequences from %s into the new file.' % (n, count, seq_file))
        n += 1
        count_total += count
    end = time.time()
    used_time = round(end - start, 2)
    print('Spent %s sec to merge %d records in %d files into %s'
          % (str(used_time), count_total, len(f_list), output_file))
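# Usage sketch (hypothetical names): merge every FASTQ file under raw_data/
# into one file, processed in sorted file-name order.
if __name__ == '__main__':
    main(['-i', 'raw_data', '-o', 'merged.fastq', '-fastq'])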
def main(name_space):
    import argparse
    import textwrap
    import sys
    from lib import random_subsample as rs
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -random_subsample')
    parser.add_argument('-r1', help='Name of the Read1 file.')
    parser.add_argument('-r2', help='Name of the Read2 file if applicable.')
    parser.add_argument('-size', default=10000, help='Sampling size for each file, default=10,000.')
    args = parser.parse_args(name_space)

    read1 = args.r1
    if args.r2:
        read2 = args.r2
    sample_size = int(args.size)

    read1_content = File_IO.read_seqs(read1)
    total_size = len(read1_content)
    file_type = "fasta"
    if len(read1_content[0]) == 4:  # FASTQ records have four fields; FASTA records have two.
        file_type = "fastq"
    if sample_size > total_size:
        print('The specified sampling size is larger than the total number of sequences.')
        sys.exit()
    else:
        seq_index = rs.generate_random_index(total_size, sample_size)

    # Pick the sampled sequences from the read1 file.
    read1_picked = []
    for index in seq_index:
        read1_picked.append(read1_content[index])

    # Pick from the read2 file with the same index if a filename was
    # specified, so that read pairs stay matched.
    if args.r2:
        read2_content = File_IO.read_seqs(read2)
        read2_picked = []
        for index in seq_index:
            read2_picked.append(read2_content[index])

    # Write to new files.
    read1_output = "R1." + file_type
    read1_count = File_IO.write_seqs(read1_picked, read1_output, checker=False, overwrite=True)
    print('{0} sequences have been randomly picked from {1}, and saved in {2}.'.format(read1_count, read1, read1_output))
    if args.r2:
        read2_output = "R2." + file_type
        read2_count = File_IO.write_seqs(read2_picked, read2_output, checker=False, overwrite=True)
        print('{0} sequences have been randomly picked from {1}, and saved in {2}.'.format(read2_count, read2, read2_output))
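# Usage sketch (hypothetical names): one random index is generated from the
# Read1 file and reused for Read2, so read pairs stay matched; output names
# are fixed to R1.<type> and R2.<type>.
if __name__ == '__main__':
    main(['-r1', 'sample_R1.fastq', '-r2', 'sample_R2.fastq', '-size', '5000'])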
def main(name_space):
    import argparse
    import textwrap
    import os
    import sys
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -add_labels')
    parser.add_argument("-i", "--input", help="Name of the input file, merged from multiple samples.")
    parser.add_argument("-o", "--output", help="Name of the output folder.")
    parser.add_argument("-r", "--read", choices=['r1', 'read1', 'r2', 'read2'], help="Read direction, read1 or read2.")
    args = parser.parse_args(name_space)
    #args = argparse.Namespace(input='read1.cut2.fastq', output='unmerged', read='read1')  # This line is for testing purposes

    input_file = args.input
    output_folder = args.output
    read_type = args.read
    if read_type == 'r1' or read_type == 'read1':
        read_type = 'R1'
    elif read_type == 'r2' or read_type == 'read2':
        read_type = 'R2'
    else:
        print('Please specify the correct read type using the -r option.')
        sys.exit()

    os.makedirs(output_folder, exist_ok=True)
    input_seqs = File_IO.read_seqs(input_file)

    # Group the records by sample name, taken as the label up to the first underscore.
    output_records = {}
    for record in input_seqs:
        sample_name = record[0][0:record[0].index('_')]
        try:
            output_records[sample_name].append(record)
        except KeyError:
            output_records[sample_name] = [record]

    for key, value in output_records.items():
        output_file = output_folder + '/' + key + '_' + read_type + '.fastq'
        File_IO.write_seqs(value, output_file, checker=False, overwrite=True)
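# Usage sketch (hypothetical names). The module assumes the text before the
# first underscore in a record name is the sample name, so a record named
# 'soil1_12345' would be routed to unmerged/soil1_R1.fastq.
if __name__ == '__main__':
    main(['-i', 'merged.fastq', '-o', 'unmerged', '-r', 'r1'])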
def main(name_space):
    import argparse
    import textwrap
    import time
    import sys
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''))
    parser.add_argument("-i", "--input", help="FASTQ file to be converted.")
    parser.add_argument("-o", "--output", help="Name of the output FASTA file")
    #parser.add_argument("-q", "--qual", action="store_true", help="Output Qual file")
    args = parser.parse_args(name_space)

    fasta_file = args.output
    #qual = args.qual
    if args.input:
        fastq_file = args.input
        start = time.time()
        print("Loading %s ..." % fastq_file)
        fasta_content = File_IO.read_seqs(fastq_file, file_type='fastq', output='fasta')
        print('Converting to FASTA ...')
        record_num = File_IO.write_seqs(fasta_content, fasta_file, checker=False, overwrite=True)
        print("Converted %d records in %s ..." % (record_num, fastq_file))
        end = time.time()
        used_time = round(end - start, 2)
        print("It took %s sec to convert (%s seqs/s).\nFASTA file saved in %s."
              % (str(used_time), str(round(record_num / used_time, 0)), fasta_file))
        # if qual:
        #     print("Quality scores saved in %s." % File_IO.name_file(fasta_file, '', 'qual'))
    else:
        print("Please specify a FASTQ file.")
        sys.exit()
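# Usage sketch (hypothetical names):
if __name__ == '__main__':
    main(['-i', 'reads.fastq', '-o', 'reads.fasta'])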
def main(name_space):
    import argparse
    import textwrap
    from lib import random_subsample as rs
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -random_subsample')
    parser.add_argument('-i', '--input', help='Name of the input folder with raw data')
    parser.add_argument('-o', '--output', default='random_dataset', help='Name of the output folder for the sampled data')
    parser.add_argument('-file_number', default=10, help='Number of files to pick.')
    parser.add_argument('-size', default=10000, help='Sampling size for each file.')
    args = parser.parse_args(name_space)

    input_folder = args.input
    output_folder = args.output
    file_number = int(args.file_number)
    sample_size = int(args.size)

    # Create a new folder for the sampled files.
    File_IO.mk_dir(output_folder)

    # Randomly pick the files to be sampled.
    input_file_list = File_IO.file_list(input_folder)
    print('Found {0} files in the folder {1}'.format(len(input_file_list), input_folder))
    file_index = rs.generate_random_index(len(input_file_list), file_number)
    file_list = []
    for index in file_index:
        file_list.append(input_file_list[index])

    # Randomly pick sequences from each file.
    for raw_file in file_list:
        print('\tRandomly sampling {0} for {1} sequences ...'.format(raw_file, sample_size), end='\r')
        current_content = File_IO.read_seqs(input_folder + '/' + raw_file)
        seq_index = rs.generate_random_index(len(current_content), sample_size)
        sampled_content = []
        for index in seq_index:
            sampled_content.append(current_content[index])
        count = File_IO.write_seqs(sampled_content, output_folder + '/' + raw_file)
    print('A randomly sampled dataset ({0} files, {1} sequences per file) was generated under the folder {2}'
          .format(file_number, sample_size, output_folder))
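# Usage sketch (hypothetical names): pick 5 random files from raw_data/ and
# 1,000 random sequences from each.
if __name__ == '__main__':
    main(['-i', 'raw_data', '-o', 'random_dataset', '-file_number', '5', '-size', '1000'])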
def main(name_space):
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -otu_deconstruct')
    parser.add_argument('-map', help='Name of the FAST-derep map.')
    parser.add_argument('-o', '--output', default='otu_deconstruct', help='Name of the output folder')
    args = parser.parse_args(name_space)

    input_map_file = args.map
    output_folder = args.output

    from lib import ParseOtuMap
    from lib import File_IO
    File_IO.mk_dir(output_folder)
    input_map = ParseOtuMap.read_fast_output(input_map_file)
    input_map = ParseOtuMap.fast_output_parser(input_map)
    input_map_size = input_map.unit_count
    print('{0} contains {1} OTUs.'.format(input_map_file, input_map_size))

    otu_list = input_map.get_seqs()  # Get a list of OTUs with their sequences
    for unit in otu_list:
        output_file = output_folder + '/' + unit[0] + '.txt'
        current_otu = input_map.detail_sample_unit(unit[0])
        print('\tWriting: {0} ...'.format(output_file), end='\r')
        with open(output_file, 'w') as f:
            for line in current_otu:
                line = '\t'.join([str(i) for i in line])
                f.write('%s\n' % line)
    print('All files written to the folder: {0}.'.format(output_folder))
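# Usage sketch (hypothetical names): write one tab-delimited file per OTU,
# detailing its per-sample composition.
if __name__ == '__main__':
    main(['-map', 'otus.fast.txt', '-o', 'otu_deconstruct'])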
def ReLabelFastQ(file_name, label, read_type, input_folder, output_folder='labeled', file_type='fastq', label_type='qiime'):
    #%% Read in the sequence file and change the headers
    from lib import File_IO
    file_content = File_IO.read_seqs(input_folder + '/' + file_name, file_type=file_type)
    head_symbol = '@'
    if len(file_content[0]) == 2:  # Two-field records are FASTA; use '>' instead of '@'.
        head_symbol = '>'
    count = 0
    for record in file_content:  # Loop through the header of each record
        record[0] = ChangeName(label, count, read_type, label_type=label_type)
        count += 1
    file_labeled = output_folder + '/labeled_' + file_name
    with open(file_labeled, 'w') as f:
        for record in file_content:
            record[0] = head_symbol + record[0]  # Add the head symbol to the sequence name
            for line in record:
                f.write('%s\n' % line)
    return count
def MainLabelFiles(mapping_file, input_folder, threads=1, output_folder='labeled', file_type='fastq', label_type='both'):
    # Create a new folder for the relabeled files
    from lib import File_IO
    File_IO.mk_dir(output_folder)

    if threads == 1:
        print("Relabeling files using %d thread ..." % threads)
        mapping = ParseMapping(mapping_file, input_folder)
        file_num = len(mapping)
        for item in mapping:
            count = ReLabelFastQ(item['file'], item['label'], item['read_type'], item['input_folder'],
                                 output_folder=output_folder, file_type=file_type, label_type=label_type)
            print("%s sequences in %s relabeled to %s as %s file.\n"
                  % (count, item['file'], item['label'], item['read_type']))
    elif threads > 1:
        print("Relabeling files using %d threads ..." % threads)
        mapping_multithreads = SplitMapping(mapping_file, input_folder, output_folder=output_folder,
                                            file_type=file_type, label_type=label_type, processor=threads)
        file_num = sum([len(i) for i in mapping_multithreads])
        worker = CreateWorker(mapping_multithreads, threads=threads)
        for item in worker:  # Start the workers
            item.start()
        for item in worker:  # Wait until all workers finish
            item.join()
    else:
        print("The number of threads cannot be negative.")
        import sys
        sys.exit()
    return file_num
#%%
def read_otu_map(filename):
    from lib import File_IO
    OtuMap = File_IO.read_file(filename)
    MapDict = {}
    for line in OtuMap:
        names = line.strip('\n').split('\t')
        MapDict[names[0]] = names[1:]
    return MapDict
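# Sketch of the expected map format (tab-separated, one unit per line: a unit
# name followed by its member sequence names; all names here are invented):
#
#   OTU_1   seq_3   seq_17  seq_42
#   OTU_2   seq_8
#
# which read_otu_map() returns as
# {'OTU_1': ['seq_3', 'seq_17', 'seq_42'], 'OTU_2': ['seq_8']}.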
def main(name_space):
    import argparse
    import textwrap
    from lib import ParseOtuMap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -subset_fast_hybrid')
    parser.add_argument('-i', '--input', help='Input FAST hybrid map.')
    parser.add_argument('-o', '--output', help='Output prefix for the derep map and sequence')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-otu_list', help='A list of OTU names separated by ","')
    group.add_argument('-otu_file', help='A file containing a list of OTU names (no header)')
    args = parser.parse_args(name_space)

    print('Subsetting a FAST hybrid map with the provided OTU names ...')
    input_file = args.input
    output_derep = args.output + '.txt'
    output_fasta = args.output + '.fasta'

    otu_list = []
    if args.otu_list:
        otu_list = args.otu_list.split(',')
    elif args.otu_file:
        with open(args.otu_file) as f:
            for line in f:
                otu_list.append(line.strip('\n'))
    print('Found {0} OTU names.'.format(len(otu_list)))

    print('Reading in the FAST hybrid map: {0} ...'.format(input_file))
    hybrid_map = ParseOtuMap.read_fast_output(input_file)
    fast_derep = {}
    for otu in otu_list:
        fast_derep.update(hybrid_map[otu]['sample'])
    ParseOtuMap.write_fast_output(fast_derep, output_derep)
    print('A FAST derep map was written to: {0}.'.format(output_derep))

    # Sort the dereplicated units by size and write them out with
    # USEARCH-style size annotations.
    derep_seq = []
    for key, value in fast_derep.items():
        derep_size = sum(value['sample'].values())
        seq_label = key + ';size=' + str(derep_size)
        current_seq = [derep_size, seq_label, value['seq']]
        derep_seq.append(current_seq)
    derep_seq.sort(reverse=True)
    derep_seq = [i[1:] for i in derep_seq]
    count = File_IO.write_seqs(derep_seq, output_fasta, checker=False)
    print('A dereplicated FASTA file was written to {0}, containing {1} sequences with size annotations.'.format(output_fasta, count))
    print('\n')
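# Usage sketch (hypothetical names): pull two OTUs out of a hybrid map into
# a derep map (subset.txt) plus a size-annotated FASTA file (subset.fasta).
if __name__ == '__main__':
    main(['-i', 'hybrid_map.txt', '-o', 'subset', '-otu_list', 'OTU_1,OTU_2'])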
def main(Namespace):
    from lib import File_IO
    from lib import ParseOtuMap
    import argparse
    import textwrap
    import sys
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -add_seqs_size')
    parser.add_argument('-i', '--input', help='Input FASTA file')
    parser.add_argument('-map', help='Input OTU map file')
    parser.add_argument('-o', '--output', help='Output FASTA file')
    args = parser.parse_args(Namespace)

    input_fasta = args.input
    input_map = args.map
    output_fasta = args.output

    print('Reading in OTU map ...')
    otu_map = ParseOtuMap.read_otu_map(input_map)
    print('Reading in sequence file ...')
    fasta = File_IO.read_seqs(input_fasta)
    print('Found %i OTUs in the map file, found %i sequences in the sequence file.' % (len(otu_map), len(fasta)))

    count = 0
    print('Adding size annotation ...')
    for record in fasta:
        try:
            size = len(otu_map[record[0]])
            if record[0][-1] == ';':
                record[0] += ('size=%i;' % size)
            else:
                record[0] += (';size=%i;' % size)
            record.append(size)
            count += 1
            print('Annotating %i sequences ...' % count, end='\r')
        except KeyError:
            print("Cannot find %s in the OTU map file." % record[0])
            sys.exit()
    print()
    print('Sorting the annotated sequences ...')
    fasta.sort(key=lambda x: x[-1], reverse=True)

    print('Writing to a new FASTA file ...')
    with open(output_fasta, 'w') as f:
        for record in fasta:
            f.write('>%s\n' % record[0])
            f.write('%s\n' % record[1])
    print('Sequences with size annotations saved in %s.' % output_fasta)
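# Usage sketch (hypothetical names): a representative sequence named 'OTU_1'
# whose map entry lists 132 members gets the label '>OTU_1;size=132;'.
if __name__ == '__main__':
    main(['-i', 'rep_seqs.fasta', '-map', 'otu_map.txt', '-o', 'rep_seqs.size.fasta'])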
def main(name_space):
    import argparse
    import textwrap
    import sys
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -generate_fast_map')
    parser.add_argument('-map', help='Name of the Qiime style OTU/Derep map file')
    parser.add_argument('-seq', help='Name of the sequence file corresponding to the Qiime map.')
    parser.add_argument('-o', '--output', help='Name of the output FAST map.')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-derep', action='store_true', help='Indicate the source is a dereplication map.')
    group.add_argument('-otu', action='store_true', help='Indicate the source is an OTU map.')
    parser.add_argument('-separator', default=';', help='Set the separator for parsing the sequence label.')
    args = parser.parse_args(name_space)

    input_map_file = args.map
    input_seq_file = args.seq
    output_map_file = args.output
    separator = args.separator
    if args.derep:
        real_sample = True
    elif args.otu:
        real_sample = False
    else:
        print('Please indicate the map source with either -derep or -otu.')
        sys.exit()

    from lib import ParseOtuMap
    from lib import File_IO
    input_map = ParseOtuMap.read_otu_map(input_map_file)
    input_seq = File_IO.read_seqs(input_seq_file)
    output_map = ParseOtuMap.generate_fast_output(input_map, input_seq, real_sample=real_sample, separator=separator)
    ParseOtuMap.write_fast_output(output_map, output_map_file)
    print('FAST style map file written to %s.' % output_map_file)
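# Usage sketch (hypothetical names): convert a Qiime-style dereplication map
# and its sequence file into a FAST-style map.
if __name__ == '__main__':
    main(['-map', 'derep.txt', '-seq', 'derep.fasta', '-o', 'derep.fast.txt', '-derep'])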
def main(name_space):
    import argparse
    import textwrap
    import sys
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -count_seqs')
    parser.add_argument("-i", "--input", help="Name of the input sequence file")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-a", "--fasta", action="store_true", help="Set file type to FASTA")
    group.add_argument("-q", "--fastq", action="store_true", help="Set file type to FASTQ")
    args = parser.parse_args(name_space)

    seq_file = args.input
    if args.fasta:
        head_symbol = '>'
        seq_type = 'fasta'
    elif args.fastq:
        head_symbol = '@'
        seq_type = 'fastq'
    else:
        # No type given: detect it from the first character of the file.
        with open(seq_file, 'r') as f:
            header = f.read(1)
        if header == '>':
            head_symbol = '>'
            seq_type = 'fasta'
            print("File type set as FASTA.")
        elif header == '@':
            head_symbol = '@'
            seq_type = 'fastq'
            print("File type set as FASTQ.")
        else:
            print('%s is not a valid header for a FASTA or FASTQ file.' % header)
            sys.exit()

    seq_content = File_IO.read_seqs(seq_file, file_type=seq_type)
    seq_count = len(seq_content)
    print("%i records found in %s." % (seq_count, seq_file))
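# Usage sketch (hypothetical name): with no -a/-q flag the file type is
# detected from the first character of the file.
if __name__ == '__main__':
    main(['-i', 'reads.fastq'])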
def main(name_space):
    import argparse
    import textwrap
    from lib import ParseOtuMap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -make_otu_table')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-qiime_map', help='The Qiime style OTU map.')
    group.add_argument('-fast_map', help='The FAST hybrid OTU map.')
    parser.add_argument('-o', '--output', help='Output OTU table.')
    parser.add_argument('-rep', help='Indicate to output a representative sequence if using the FAST method.')
    args = parser.parse_args(name_space)

    if args.qiime_map is not None:
        input_file = args.qiime_map
        method = 'qiime'
    elif args.fast_map is not None:
        input_file = args.fast_map
        method = 'fast'
    if args.rep is not None:
        output_seq_file = args.rep
    output_file = args.output

    # Parse the OTU map into an OTU table dictionary.
    # Qiime style map:
    if method == 'qiime':
        print('Reading the Qiime style OTU map: {0} ...'.format(input_file))
        otu_map = ParseOtuMap.read_otu_map(input_file)
        sample_list = []
        otu_table_dict = {}
        for key, value in otu_map.items():
            otu_table_dict[key] = {}
            for sample in value:
                treatment = sample[:sample.find('_')]  # Sample name is the label up to the first underscore.
                if treatment not in sample_list:
                    sample_list.append(treatment)
                try:
                    otu_table_dict[key][treatment] += 1
                except KeyError:
                    otu_table_dict[key][treatment] = 1

    # FAST style map:
    if method == 'fast':
        print('Reading the FAST hybrid OTU map: {0} ...'.format(input_file))
        otu_map = ParseOtuMap.read_fast_output(input_file)
        otu_map_parser = ParseOtuMap.fast_output_parser(otu_map)
        sample_list, otu_table_dict = otu_map_parser.parse_otu_table()
        if args.rep is not None:
            temp_content = otu_map_parser.get_seqs()
            rep_seq = []
            for item in temp_content:
                rep_seq.append(item[:2])
            rep_seq_count = File_IO.write_seqs(rep_seq, output_seq_file, checker=False, overwrite=True)
            print('{0} OTUs were written to {1}.'.format(rep_seq_count, output_seq_file))

    # Convert the OTU table dictionary to a table, filling absent samples with zero.
    sample_list.sort()
    otu_table = []
    for key, value in otu_table_dict.items():
        current_otu = [key]
        for sample in sample_list:
            try:
                current_otu.append(value[sample])
            except KeyError:
                current_otu.append(0)
        otu_table.append(current_otu)
    otu_table.sort(key=lambda x: sum(map(int, x[1:])), reverse=True)  # Sort the OTUs by total abundance.
    otu_table = [['OTU_ID'] + sample_list] + otu_table

    # Write the OTU table to a new file.
    with open(output_file, 'w') as f:
        for line in otu_table:
            line = [str(i) for i in line]
            f.write('%s\n' % '\t'.join(line))
    print('OTU table with {0} samples was saved in {1}.'.format(len(sample_list), output_file))
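# Sketch of the tab-delimited output (sample names invented): rows are OTUs
# sorted by total abundance, columns are the samples found in the map.
#
#   OTU_ID  soil1   soil2
#   OTU_1   120     87
#   OTU_2   5       0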
def main(name_space):
    import argparse
    import textwrap
    from lib import random_subsample as rs
    from lib import ParseOtuMap
    from lib import Seq_IO
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -pick_seqs')
    parser.add_argument('-i', '--input', help='Input FASTA file to be picked')
    parser.add_argument('-o', '--output', help='Name for the output FASTA file.')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-map', help='OTU map file; picks by OTU names by default')
    group.add_argument('-name_list', help='File with names on separate lines')
    group.add_argument('-random_pick', help='Randomly pick the given number of sequences.')
    parser.add_argument('-sequence', action='store_true', help='Indicate to pick by sequence names instead of OTU names')
    parser.add_argument('-sizeout', action='store_true', help='Indicate to output size labels.')
    args = parser.parse_args(name_space)

    input_fasta = args.input
    output_fasta = args.output
    pick_list = []
    print('\n')

    if args.map:
        otu_map = ParseOtuMap.read_otu_map(args.map)
        if not args.sequence:
            for key in otu_map:
                pick_list.append(key)
        else:
            for key, value in otu_map.items():
                pick_list += value
        print('Picking sequences from the OTU map: %s.' % args.map)
        print('Found %i names to be picked.' % len(pick_list))
    if args.name_list:
        with open(args.name_list, 'r') as f:
            for line in f:
                pick_list.append(line.strip('\n'))
        print('Picking sequences from an OTU list.')
        print('Found %i names to be picked.' % len(pick_list))

    if args.random_pick:
        # Random mode: sample the requested number of sequences and finish.
        pick_size = int(args.random_pick)
        print('Randomly picking %i sequences.' % pick_size)
        print('Reading in the original FASTA file: %s ...' % input_fasta)
        input_content = File_IO.read_seqs(input_fasta)
        print('Randomly sampling %i sequences out of %i ...' % (pick_size, len(input_content)))
        seq_index = rs.generate_random_index(len(input_content), pick_size)
        sampled_content = []
        for index in seq_index:
            sampled_content.append(input_content[index])
        count = File_IO.write_seqs(sampled_content, output_fasta, checker=False, overwrite=True)
        print('Picked sequences were written to %s.' % output_fasta)
    else:
        print('Reading in the original FASTA file: %s ...' % input_fasta)
        input_content = File_IO.read_seqs(input_fasta)
        for record in input_content:
            record[0] = record[0].split(' ')[0]  # Cut the name at the first space
            if record[0].find(';') != -1:
                record[0] = record[0][:record[0].find(';')]  # Cut the label at the first ";"
        print('Indexing the original sequence file ...')
        input_dict = Seq_IO.make_dict(input_content)

        count_picked = 0
        count_missed = 0
        print('Searching the name list in the sequence file ...')
        picked_content = []
        if args.sizeout:
            print("Outputting size labels ...")
            size_list = []
            for record in pick_list:
                size_list.append([record, len(otu_map[record])])
            size_list = sorted(size_list, key=lambda x: x[1], reverse=True)
            for record in size_list:
                try:
                    new_label = record[0] + ';size=' + str(record[1])
                    picked_content.append([new_label, input_dict[record[0]][0]])
                    count_picked += 1
                except KeyError:
                    count_missed += 1
        else:
            for name in pick_list:
                try:
                    picked_content.append([name, input_dict[name][0]])
                    count_picked += 1
                except KeyError:
                    count_missed += 1
        print('Finished searching.')
        print('Original sequences=%i' % len(input_content))
        print('Input names=%i' % len(pick_list))
        print('Picked sequences=%i' % count_picked)
        print('Sequences not found=%i' % count_missed)

        print('Writing to a new FASTA file ...')
        count = File_IO.write_seqs(picked_content, output_fasta, checker=False, overwrite=True)
        print('Picked sequences were written to %s.' % output_fasta)
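# Usage sketch (hypothetical names): keep only the sequences named in an OTU
# map and append USEARCH-style size labels.
if __name__ == '__main__':
    main(['-i', 'all_seqs.fasta', '-o', 'picked.fasta', '-map', 'otu_map.txt', '-sizeout'])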
def main(name_space):
    from lib import random_subsample as rs
    from lib import ParseOtuTable
    from lib import File_IO
    import argparse
    import textwrap
    import time
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -rarefy_otu_table')
    parser.add_argument('-otu', help='Input OTU table')
    parser.add_argument('-o', '--output', help='Output OTU table')
    parser.add_argument('-d', '--depth', help='Sampling depth for each sample')
    parser.add_argument('-iter', default=1, help='Number of iterations for each sample')
    parser.add_argument('-thread', default=1, help='Number of threads')
    parser.add_argument('-keep_all', action='store_true', help='Indicate to keep all samples')
    parser.add_argument('-meta_column', default='taxonomy', help='Name of the first metadata column')
    args = parser.parse_args(name_space)

    input_otu = args.otu
    iter_num = int(args.iter)
    thread = int(args.thread)
    meta_col = args.meta_column
    if args.output:
        output_otu = args.output
    else:
        output_otu = File_IO.name_file(input_otu, '', 'rare')

    otu_table = ParseOtuTable.parser_otu_table(input_otu, meta_col=meta_col)
    input_sample = otu_table.sample_matrix

    start = time.time()
    print('Input OTU table: %s' % input_otu)
    if args.depth:
        depth = int(args.depth)
        print('Sampling depth: %i' % depth)
    else:
        depth = min([sum(i[1:]) for i in input_sample])
        print('Sampling depth set to the minimum abundance: %i' % depth)
    print('Iterations for each sample: %i' % iter_num)
    print('Number of threads: %i' % thread)

    print('Reading in the OTU table ...')
    if args.keep_all:
        count = 0
        for line in input_sample:
            if sum(line[1:]) < depth:
                count += 1
        print('Found %i samples in the OTU table.' % len(input_sample))
        print('%i samples have a total abundance below the sampling depth, but will be kept in the output.' % count)
    else:
        temp = []
        count = 0
        for line in input_sample:
            if sum(line[1:]) >= depth:
                temp.append(line)
            else:
                count += 1
        input_sample = temp
        print('Found %i samples in the OTU table.' % len(input_sample))
        print('%i samples have a total abundance below the sampling depth, and will be excluded.' % count)

    otu_id = otu_table.species_id
    otu_table_rarefied = [['OTU_ID'] + otu_id + otu_table.meta_id]
    for sample in input_sample:
        print('Rarefying %s ...' % sample[0])
        repeat_sample = rs.repeat_rarefaction_parallel(sample[1:], depth, iter_num, processor=thread)
        repeat_sample.sort(key=lambda x: sum(i > 0 for i in x))
        repeat_sample = [sample[0]] + repeat_sample[int(iter_num / 2)]  # Pick the rarefied sample with the median richness
        otu_table_rarefied.append(repeat_sample[:])
    otu_table_rarefied = [list(i) for i in zip(*otu_table_rarefied)]

    # Attach the metadata columns back to the rarefied table.
    meta_data = otu_table.meta_dict()
    for line in otu_table_rarefied[1:]:
        for key in otu_table.meta_id:
            line.append(meta_data[key][line[0]])
    for key in otu_table.meta_id:
        otu_table_rarefied[0].append(key)

    with open(output_otu, 'w') as f:
        for line in otu_table_rarefied:
            line = [str(i) for i in line]
            f.write('%s\n' % '\t'.join(line))
    print('Rarefied OTU table saved in %s.' % output_otu)
    end = time.time()
    used_time = round(float(end - start), 2)
    time_per_sample = round(used_time / len(input_sample), 2)
    print('Total time used: %s seconds (%s seconds per sample)' % (str(used_time), str(time_per_sample)))
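# Usage sketch (hypothetical names): rarefy every sample to 10,000 sequences,
# repeating 99 times per sample and keeping the iteration with the median
# richness.
if __name__ == '__main__':
    main(['-otu', 'otu_table.txt', '-o', 'otu_table.rare.txt', '-d', '10000', '-iter', '99'])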
def main(Namespace):
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -dereplicate')
    parser.add_argument('-i', '--input', help='Input FASTA file to be dereplicated.')
    parser.add_argument('-o', '--output', help='Name for the output OTU map and FASTA file.')
    parser.add_argument('-t', '--thread', default=1, help='Number of threads to be used.')
    parser.add_argument('-fast', default="", help="Name of the FAST style output file.")
    parser.add_argument('-sizeout', action='store_true', help='Specify to add a USEARCH style size label: ";size=XXX"')
    args = parser.parse_args(Namespace)

    input_file = args.input
    output_name = args.output
    output_map = output_name + '.txt'
    output_fasta = output_name + '.fasta'
    thread = int(args.thread)

    import time
    from lib import File_IO
    from multiprocessing import Process, Manager
    import sys
    print('Using %i threads ...' % thread)
    start = time.time()
    print('Loading %s ...' % input_file)
    seqs = File_IO.read_seqs(input_file)
    seqs_num = len(seqs)
    print('Read in %i sequences.' % seqs_num)

    # Skip multiprocessing when a single thread is requested.
    if thread == 1:
        derep_dict = dereplicate_single_thread(seqs)
    else:
        # Divide the sequences into one job per thread.
        print('Separating raw sequences into %d jobs ...' % thread)
        d = divide_seqs(seqs_num, thread)

        # Create shared lists to store the dereplicated dicts and a progress counter.
        manager = Manager()
        derep_dict = manager.list([{}] * thread)
        count = manager.list([0] * thread)

        print('Starting dereplicating ...')
        workers = []
        for i in range(thread):
            current_range = d[i]
            workers.append(Process(target=dereplicate_worker,
                                   args=(seqs[current_range[0]:current_range[1]], derep_dict, i, count)))
        del seqs

        print('Starting %i jobs ...' % thread)
        count_worker = 1
        for job in workers:
            job.start()
            print('Starting thread No. %i ...' % count_worker)
            count_worker += 1

        job_alive = True
        while job_alive:
            time.sleep(0.01)
            job_alive = False
            for job in workers:
                if job.is_alive():
                    job_alive = True
            #progress = "Dereplicating: " + str(round(sum(count)/float(seqs_num)*100, 2)) + "%" + "\r"
            #sys.stderr.write(progress)
        for derep_worker in workers:
            derep_worker.join()
        print('Finished dereplicating.')
        seqs = []  # Empty the sequence list to free memory.

    # Merge the per-thread dereplicated dictionaries into a single dict.
    sys.stderr.write('\n')
    if thread > 1:
        sys.stderr.write('Merging %i dictionaries into one ...' % len(derep_dict))
        merged_dict = {}
        count = 0
        for d in derep_dict:
            for key, value in d.items():
                count += 1
                try:
                    merged_dict[key] += value
                except KeyError:
                    merged_dict[key] = value
            derep_dict[0] = ''  # Empty the finished dictionary to free memory.
    else:
        merged_dict = derep_dict

    print()
    print("Sequences dereplicated, collapsed from %i into %i sequences." % (seqs_num, len(merged_dict)))
    s = [len(merged_dict[i]) for i in merged_dict]
    print('Dereplicated OTU size: Max=%i, Min=%i, Average=%.2f.' % (max(s), min(s), float(sum(s)) / len(s)))
    end = time.time()
    print("Used time: " + str(end - start) + ' seconds.')
    print()

    # Name the dereplicated groups, largest first.
    size_list = sorted([[len(merged_dict[i]), i] for i in merged_dict], reverse=True)
    count = 0
    for element in size_list:
        derep_name = 'derep_' + str(count)
        element.append(derep_name)
        count += 1

    # Output the dereplicated FASTA file.
    print('Writing dereplicated sequences and the OTU map ...')
    output_seq_file = output_fasta
    with open(output_seq_file, 'w') as f:
        if args.sizeout:
            for element in size_list:
                output_label = element[2] + ";size=" + str(element[0])
                f.write('>%s\n' % output_label)
                f.write('%s\n' % element[1])
        else:
            for element in size_list:
                output_label = element[2]
                f.write('>%s\n' % output_label)
                f.write('%s\n' % element[1])
    print('%s contains dereplicated sequences.' % output_fasta)

    # Output a Qiime style map.
    with open(output_map, 'w') as f:
        for element in size_list:
            name_list = merged_dict[element[1]]
            f.write('%s\t%s\n' % (element[2], '\t'.join(name_list)))  # Use the last element as the group name
    print('%s contains an OTU map for the dereplicated sequences.' % output_map)

    # Generate a FAST style derep output file (a single JSON file with sample
    # names, counts, and dereplicated sequences).
    if args.fast != "":
        fast_file = args.fast
        fast_dict = {}
        for element in size_list:
            fast_dict[element[2]] = {}  # Create a new dict for the current derep unit
            fast_dict[element[2]]['seq'] = element[1]  # Save the dereplicated sequence
            sample_dict = {}  # Create a dict for per-sample sequence counts
            name_list = merged_dict[element[1]]
            for sample in name_list:
                current_sample = get_treatment(sample)
                try:
                    sample_dict[current_sample] += 1
                except KeyError:
                    sample_dict[current_sample] = 1
            fast_dict[element[2]]['sample'] = sample_dict
        import json
        with open(fast_file, 'w') as f:
            json.dump(fast_dict, f)
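# Sketch of the FAST-style JSON written above (all names invented): one entry
# per dereplicated unit, holding its sequence and per-sample counts.
#
#   {"derep_0": {"seq": "ACGT...", "sample": {"soil1": 42, "soil2": 7}}}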
def main(Namespace):
    from lib import File_IO
    from lib import Seq_IO
    import argparse
    import textwrap
    import sys
    import time
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -filter_seqs')
    parser.add_argument('-i', '--input', help='Name of the input file, can be FASTA or FASTQ')
    parser.add_argument('-o', '--output', help='Name of the output file')
    parser.add_argument('-maxN', help='Maximum number of ambiguous bases')
    parser.add_argument('-maxhomop', help='Maximum length of homopolymer')
    args = parser.parse_args(Namespace)

    start = time.time()
    seq_file = args.input
    filtered_file = args.output

    print('Reading in %s ...' % seq_file)
    seqs = File_IO.read_seqs(seq_file)
    count_total = len(seqs)
    print('Found %d sequences.' % count_total)
    if len(seqs[0]) == 2:
        seqs_type = 'fasta'
    elif len(seqs[0]) == 4:
        seqs_type = 'fastq'
    else:
        print('This is not a correct FASTA or FASTQ file, please check your file.')
        sys.exit()

    checkN = False
    check_homop = False
    if args.maxN:
        maxN = int(args.maxN)
        checkN = True
        print('Maximum ambiguous bases allowed: %d' % maxN)
    if args.maxhomop:
        maxhomop = int(args.maxhomop)
        check_homop = True
        print('Maximum length of homopolymer: %d' % maxhomop)

    # Encode which checks to run: 12 = both, 1 = ambiguous bases only, 2 = homopolymers only.
    checker = 0
    if checkN and check_homop:
        checker = 12
    elif checkN:
        checker = 1
    elif check_homop:
        checker = 2

    seqs_filtered = []
    count_pass = 0
    count_total = 0
    if checker == 12:
        for record in seqs:
            count_total += 1
            current_record = record[1]
            if not Seq_IO.check_ambiguous(current_record, maxN):
                if not Seq_IO.check_homop(current_record, maxhomop + 1):
                    seqs_filtered.append(record)
                    count_pass += 1
    if checker == 1:
        for record in seqs:
            count_total += 1
            current_record = record[1]
            if not Seq_IO.check_ambiguous(current_record, maxN):
                seqs_filtered.append(record)
                count_pass += 1
    if checker == 2:
        for record in seqs:
            count_total += 1
            current_record = record[1]
            if not Seq_IO.check_homop(current_record, maxhomop + 1):
                seqs_filtered.append(record)
                count_pass += 1

    end = time.time()
    used_time = round(float(end - start), 2)
    print()
    print('Filtered %d sequences, %d (%s%%) passed. Used %s seconds.'
          % (count_total, count_pass, str(round(float(count_pass) / count_total * 100, 1)), str(used_time)))
    print('Writing to %s ...' % filtered_file)
    count = File_IO.write_seqs(seqs_filtered, filtered_file, checker=False, overwrite=True)
    print('Filtered sequences (%i seqs) stored in %s' % (count, filtered_file))
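# Usage sketch (hypothetical names): discard reads with more than one
# ambiguous base or a homopolymer longer than 8 bp.
if __name__ == '__main__':
    main(['-i', 'reads.fastq', '-o', 'reads.filtered.fastq', '-maxN', '1', '-maxhomop', '8'])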
def divide_seqs(total, thread_num):
    # Reconstructed from a fragment: split the total sequence count into
    # near-equal chunk sizes (the remainder is folded into the first chunk),
    # then convert the sizes into the [start, end) index ranges expected by
    # the dereplication workers in main().
    size = total // thread_num
    seqs_divide = [size] * thread_num
    seqs_divide[0] += total % thread_num
    ranges = []
    start = 0
    for s in seqs_divide:
        ranges.append([start, start + s])
        start += s
    return ranges
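# Worked example (not from the original source): divide_seqs(10, 3) returns
# [[0, 4], [4, 7], [7, 10]], covering every index in [0, 10) exactly once.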
def main(name_space):
    import argparse
    import textwrap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -truncate_seqs')
    parser.add_argument("-i", "--input", help="Name of the input FASTA file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-fixed_length', help='A fixed length to cut all sequences to.')
    group.add_argument('-slice', help='Slice sizes to cut from the head and tail of each sequence, in the format "head,tail".')
    parser.add_argument('-sliced_out', action='store_true', help='Indicate to output the sliced-off sequences.')
    parser.add_argument("-o", "--output", help="Name of the output file.")
    args = parser.parse_args(name_space)

    if args.fixed_length:
        truncate_length = int(args.fixed_length)
        print("Reading in %s ..." % args.input)
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("%s contains %i records." % (args.input, count))
        print("Cutting sequences to a fixed length: %i ..." % truncate_length)
        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                if len(record[1]) >= truncate_length:
                    if len(record) == 2:  # FASTA record
                        f.write('>%s\n' % record[0])
                        f.write('%s\n' % record[1][:truncate_length])
                    elif len(record) == 4:  # FASTQ record: truncate the quality string as well
                        f.write('@%s\n' % record[0])
                        f.write('%s\n' % record[1][:truncate_length])
                        f.write('%s\n' % record[2])
                        f.write('%s\n' % record[3][:truncate_length])
                else:
                    count_fail += 1
        print("%i sequences were cut to %i and saved in %s." % (count - count_fail, truncate_length, args.output))

    if args.slice:
        slice_window = args.slice.split(',')
        head = int(slice_window[0])
        tail = int(slice_window[1])
        print("Reading in %s ..." % args.input)
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("%s contains %i records." % (args.input, count))
        print("Slicing %i bp from the head and %i bp from the tail ..." % (head, tail))
        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                seq_len = len(record[1])
                if seq_len > head + tail:
                    if len(record) == 2:
                        f.write('>%s\n' % record[0])
                        f.write('%s\n' % record[1][head:(seq_len - tail)])
                    elif len(record) == 4:
                        f.write('@%s\n' % record[0])
                        f.write('%s\n' % record[1][head:(seq_len - tail)])
                        f.write('%s\n' % record[2])
                        f.write('%s\n' % record[3][head:(seq_len - tail)])
                else:
                    count_fail += 1
        print("%i sequences were sliced and saved in %s." % (count - count_fail, args.output))

        if args.sliced_out:
            if head > 0:
                head_output = 'head.' + args.output
                with open(head_output, 'w') as f:
                    for record in sequences:
                        if len(record) == 2:
                            f.write('>%s\n' % record[0])
                            f.write('%s\n' % record[1][:head])
                        elif len(record) == 4:
                            f.write('@%s\n' % record[0])
                            f.write('%s\n' % record[1][:head])
                            f.write('%s\n' % record[2])
                            f.write('%s\n' % record[3][:head])
                print('The sliced head sequences were written to %s.' % head_output)
            if tail > 0:
                tail_output = 'tail.' + args.output
                with open(tail_output, 'w') as f:
                    for record in sequences:
                        seq_len = len(record[1])
                        if len(record) == 2:
                            f.write('>%s\n' % record[0])
                            f.write('%s\n' % record[1][(seq_len - tail):])
                        elif len(record) == 4:
                            f.write('@%s\n' % record[0])
                            f.write('%s\n' % record[1][(seq_len - tail):])
                            f.write('%s\n' % record[2])
                            f.write('%s\n' % record[3][(seq_len - tail):])
                print('The sliced tail sequences were written to %s.' % tail_output)
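# Usage sketch (hypothetical names): trim 20 bp from the head and 19 bp from
# the tail of every read; reads of 39 bp or shorter are dropped.
if __name__ == '__main__':
    main(['-i', 'reads.fastq', '-o', 'reads.trimmed.fastq', '-slice', '20,19'])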
def main(Namespace):
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -dereplicate')
    parser.add_argument('-i', '--input', help='Input FASTA file to be dereplicated.')
    parser.add_argument('-o', '--output', help='Name for output OTU map and FASTA file.')
    parser.add_argument('-t', '--thread', default=1, help='Number of threads to be used.')
    parser.add_argument('-fast', default="", help='Name of FAST style output file.')
    parser.add_argument('-sizeout', action='store_true', help='Specify to add a USEARCH style size label: ";size=XXX"')
    args = parser.parse_args(Namespace)

    input_file = args.input
    output_name = args.output
    output_map = output_name + '.txt'
    output_fasta = output_name + '.fasta'
    thread = int(args.thread)

    import time
    from lib import File_IO
    from multiprocessing import Process, Manager
    import sys

    print('Using %i threads ...' % thread)
    start = time.time()
    print('Loading %s ...' % input_file)
    seqs = File_IO.read_seqs(input_file)
    seqs_num = len(seqs)
    print('Read in %i sequences.' % seqs_num)

    # Disable multiprocessing if using a single thread
    if thread == 1:
        derep_dict = dereplicate_single_thread(seqs)
    else:
        # Separate the sequences into one pool per thread
        print('Separating raw sequences into %d jobs ...' % thread)
        d = divide_seqs(seqs_num, thread)
        # Create shared lists to store the dereplicated dicts and a progress counter
        manager = Manager()
        derep_dict = manager.list([{}] * thread)
        count = manager.list([0] * thread)
        print('Starting dereplicating ...')
        workers = []
        for i in range(thread):
            current_range = d[i]
            workers.append(Process(target=dereplicate_worker,
                                   args=(seqs[current_range[0]:current_range[1]], derep_dict, i, count)))
        del seqs
        print('Starting %i jobs ...' % thread)
        count_worker = 1
        for job in workers:
            job.start()
            print('Starting thread No. %i ...' % count_worker)
            count_worker += 1
        job_alive = True
        while job_alive:
            time.sleep(0.01)
            job_alive = False
            for job in workers:
                if job.is_alive():
                    job_alive = True
        for derep_worker in workers:
            derep_worker.join()
        print('Finished dereplicating.')
    seqs = []  # Empty the sequence list to free memory.

    # Merge the dereplicated dictionaries into a single dict
    sys.stderr.write('\n')
    if thread > 1:
        sys.stderr.write('Merging %i dictionaries into one ...' % len(derep_dict))
        merged_dict = {}
        count = 0
        for d in derep_dict:
            for key, value in d.items():
                count += 1
                try:
                    merged_dict[key] += value
                except KeyError:
                    merged_dict[key] = value
            derep_dict[0] = ''  # Empty the finished dictionary to free memory.
    else:
        merged_dict = derep_dict

    print('')
    print('Sequences dereplicated, collapsed from %i into %i sequences.' % (seqs_num, len(merged_dict)))
    s = [len(merged_dict[i]) for i in merged_dict]
    print('Dereplicated OTU size: Max=%i, Min=%i, Average=%i.' % (max(s), min(s), round(float(sum(s)) / len(s), 2)))
    end = time.time()
    print('Used time: ' + str(end - start) + ' seconds.')
    print('')

    # Name the dereplicated groups in order of decreasing size
    size_list = sorted([[len(merged_dict[i]), i] for i in merged_dict], reverse=True)
    count = 0
    for element in size_list:
        derep_name = 'derep_' + str(count)
        element.append(derep_name)
        count += 1

    # Output the dereplicated FASTA file
    print('Writing dereplicated sequence and OTU map ...')
    with open(output_fasta, 'w') as f:
        if args.sizeout:
            for element in size_list:
                output_label = element[2] + ';size=' + str(element[0])
                f.write('>%s\n' % output_label)
                f.write('%s\n' % element[1])
        else:
            for element in size_list:
                f.write('>%s\n' % element[2])
                f.write('%s\n' % element[1])
    print('%s contains dereplicated sequences.' % output_fasta)

    # Output a QIIME style OTU map
    with open(output_map, 'w') as f:
        for element in size_list:
            name_list = merged_dict[element[1]]
            f.write('%s\t%s\n' % (element[2], '\t'.join(name_list)))  # Use the derep name as the group name
    print('%s contains an OTU map for dereplicated sequences.' % output_map)

    # Generate a FAST style derep output file (a single file with sample names,
    # counts, and dereplicated sequences)
    if args.fast != "":
        fast_file = args.fast
        fast_dict = {}
        for element in size_list:
            fast_dict[element[2]] = {}  # Create a new dict for the current derep unit
            fast_dict[element[2]]['seq'] = element[1]  # Save the dereplicated sequence
            sample_dict = {}  # Per-sample sequence counts for this derep unit
            name_list = merged_dict[element[1]]
            for sample in name_list:
                current_sample = get_treatment(sample)
                try:
                    sample_dict[current_sample] += 1
                except KeyError:
                    sample_dict[current_sample] = 1
            fast_dict[element[2]]['sample'] = sample_dict
        import json
        json.dump(fast_dict, open(fast_file, 'w'))
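A minimal usage sketch (my addition, not part of the original tool; file names are hypothetical). It relies only on the fact that main() passes its argument list straight to parser.parse_args(), and the __main__ guard matters because this step spawns multiprocessing workers:

if __name__ == '__main__':
    # Dereplicate merged.fasta into derep.fasta plus derep.txt (the QIIME-style
    # OTU map), using 4 threads and USEARCH-style ';size=' labels.
    main(['-i', 'merged.fasta', '-o', 'derep', '-t', '4', '-sizeout'])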
def main(Namespace):
    from lib import File_IO
    from lib import Seq_IO
    import argparse
    import textwrap
    import sys
    import time
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -filter_seqs')
    parser.add_argument('-i', '--input', help='Name of the input file, can be FASTA or FASTQ')
    parser.add_argument('-o', '--output', help='Name of the output file')
    parser.add_argument('-maxN', help='Maximum number of ambiguous bases allowed')
    parser.add_argument('-maxhomop', help='Maximum length of homopolymer allowed')
    args = parser.parse_args(Namespace)

    start = time.time()
    seq_file = args.input
    filtered_file = args.output
    print('Reading in %s ...' % seq_file)
    seqs = File_IO.read_seqs(seq_file)
    count_total = len(seqs)
    print('Found %d sequences.' % count_total)
    if len(seqs[0]) == 2:
        seqs_type = 'fasta'
    elif len(seqs[0]) == 4:
        seqs_type = 'fastq'
    else:
        print('This is not a correct FASTA or FASTQ file, please check your file.')
        sys.exit()

    checkN = False
    check_homop = False
    if args.maxN:
        maxN = int(args.maxN)
        checkN = True
        print('Maximum ambiguous bases allowed: %d' % maxN)
    if args.maxhomop:
        maxhomop = int(args.maxhomop)
        check_homop = True
        print('Maximum homopolymer length allowed: %d' % maxhomop)

    # Encode which filters are active: 12 = both, 1 = maxN only, 2 = maxhomop only
    checker = 0
    if checkN and check_homop:
        checker = 12
    elif checkN:
        checker = 1
    elif check_homop:
        checker = 2

    seqs_filtered = []
    count_pass = 0
    count_total = 0
    if checker == 12:
        for record in seqs:
            count_total += 1
            current_record = record[1]
            if not Seq_IO.check_ambiguous(current_record, maxN):
                if not Seq_IO.check_homop(current_record, maxhomop + 1):
                    seqs_filtered.append(record)
                    count_pass += 1
    if checker == 1:
        for record in seqs:
            count_total += 1
            current_record = record[1]
            if not Seq_IO.check_ambiguous(current_record, maxN):
                seqs_filtered.append(record)
                count_pass += 1
    if checker == 2:
        for record in seqs:
            count_total += 1
            current_record = record[1]
            if not Seq_IO.check_homop(current_record, maxhomop + 1):
                seqs_filtered.append(record)
                count_pass += 1

    end = time.time()
    used_time = round(float(end - start), 2)
    print('')
    print('Filtered %d sequences, %d (%s%%) passed. Used %s seconds.' % (
        count_total, count_pass, str(round(float(count_pass) / count_total * 100, 2)), str(used_time)))
    print('Writing to %s ...' % filtered_file)
    count = File_IO.write_seqs(seqs_filtered, filtered_file, checker=False, overwrite=True)
    print('Filtered sequences (%i seqs) stored in %s' % (count, filtered_file))
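A usage sketch for the filter step (hypothetical file names; assumes dispatch through this module's main(), as above):

if __name__ == '__main__':
    # Keep only reads with no ambiguous base (N) and no homopolymer
    # longer than 8 bp.
    main(['-i', 'reads.fastq', '-o', 'reads.filtered.fastq',
          '-maxN', '0', '-maxhomop', '8'])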
def main(name_space):
    import argparse
    import textwrap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -truncate_seqs')
    parser.add_argument("-i", "--input", help="Name of the input FASTA file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-fixed_length', help='A fixed length to cut all sequences down to.')
    group.add_argument('-slice', help='Slice sizes to cut from the head and tail of each sequence, in the format "head,tail".')
    parser.add_argument('-sliced_out', action='store_true', help='Indicate to also output the sliced-off sequences.')
    parser.add_argument("-o", "--output", help="Name of the output file.")
    args = parser.parse_args(name_space)

    if args.fixed_length:
        truncate_length = int(args.fixed_length)
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("Reading in %s ..." % args.input)
        print("%s contains %i records." % (args.input, count))
        print("Cutting sequences to a fixed length: %i ..." % truncate_length)
        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                if len(record[1]) >= truncate_length:
                    if len(record) == 2:  # FASTA record
                        f.write('>%s\n' % record[0])
                        f.write('%s\n' % record[1][:truncate_length])
                    elif len(record) == 4:  # FASTQ record: truncate the quality string as well
                        f.write('@%s\n' % record[0])
                        f.write('%s\n' % record[1][:truncate_length])
                        f.write('%s\n' % record[2])
                        f.write('%s\n' % record[3][:truncate_length])
                else:
                    count_fail += 1
        print("%i sequences were cut to %i and saved in %s." % (count - count_fail, truncate_length, args.output))

    if args.slice:
        slice_window = args.slice.split(',')
        head = int(slice_window[0])
        tail = int(slice_window[1])
        sequences = File_IO.read_seqs(args.input)
        count = len(sequences)
        print("Reading in %s ..." % args.input)
        print("%s contains %i records." % (args.input, count))
        print("Slicing %i bp from the head and %i bp from the tail ..." % (head, tail))
        count_fail = 0
        with open(args.output, 'w') as f:
            for record in sequences:
                seq_len = len(record[1])
                if seq_len > head + tail:
                    if len(record) == 2:
                        f.write('>%s\n' % record[0])
                        f.write('%s\n' % record[1][head:(seq_len - tail)])
                    elif len(record) == 4:
                        f.write('@%s\n' % record[0])
                        f.write('%s\n' % record[1][head:(seq_len - tail)])
                        f.write('%s\n' % record[2])
                        f.write('%s\n' % record[3][head:(seq_len - tail)])
                else:
                    count_fail += 1
        print("%i sequences were sliced and saved in %s." % (count - count_fail, args.output))

        if args.sliced_out:
            if head > 0:
                head_output = 'head.' + args.output
                with open(head_output, 'w') as f:
                    for record in sequences:
                        if len(record) == 2:
                            f.write('>%s\n' % record[0])
                            f.write('%s\n' % record[1][:head])
                        elif len(record) == 4:
                            f.write('@%s\n' % record[0])
                            f.write('%s\n' % record[1][:head])
                            f.write('%s\n' % record[2])
                            f.write('%s\n' % record[3][:head])
                print('The sliced head sequences were written to %s.' % head_output)
            if tail > 0:
                tail_output = 'tail.' + args.output
                with open(tail_output, 'w') as f:
                    for record in sequences:
                        seq_len = len(record[1])
                        if len(record) == 2:
                            f.write('>%s\n' % record[0])
                            f.write('%s\n' % record[1][(seq_len - tail):])
                        elif len(record) == 4:
                            f.write('@%s\n' % record[0])
                            f.write('%s\n' % record[1][(seq_len - tail):])
                            f.write('%s\n' % record[2])
                            f.write('%s\n' % record[3][(seq_len - tail):])
                print('The sliced tail sequences were written to %s.' % tail_output)
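A usage sketch covering the two mutually exclusive modes (hypothetical file names, my addition):

if __name__ == '__main__':
    # Mode 1: truncate every read to a fixed 250 bp (shorter reads are dropped).
    main(['-i', 'reads.fastq', '-fixed_length', '250', '-o', 'reads.250.fastq'])
    # Mode 2: remove 20 bp from the head and 19 bp from the tail, and also
    # write the sliced-off pieces to head.<output> and tail.<output>.
    main(['-i', 'reads.fastq', '-slice', '20,19', '-sliced_out',
          '-o', 'reads.sliced.fastq'])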
def main(name_space):
    from lib import File_IO
    from lib import Seq_IO
    import argparse
    import textwrap
    import time
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -stat_seqs')
    parser.add_argument("-i", "--input", help="Name of the input sequence file")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-o", "--output", default='report.txt', help="Specify a report file for output")
    args = parser.parse_args(name_space)

    seq_file = args.input
    report_file = args.output
    print("Reading in %s ..." % seq_file)
    seq_content = File_IO.read_seqs(seq_file)
    print('Found %d sequences in the current file, analyzing ...' % len(seq_content))
    start = time.time()

    seq_length = {}
    seq_ambiguous = {}
    seq_homop = {'All': {}, 'A': {}, 'T': {}, 'C': {}, 'G': {}}
    seq_total_bases = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
    for record in seq_content:
        temp_seq = record[1]
        temp_length = len(temp_seq)
        try:
            seq_length[temp_length] += 1
        except KeyError:
            seq_length[temp_length] = 1
        temp_ambiguous = Seq_IO.count_ambiguous(temp_seq)
        try:
            seq_ambiguous[temp_ambiguous] += 1
        except KeyError:
            seq_ambiguous[temp_ambiguous] = 1
        temp_homop = Seq_IO.count_homop(temp_seq)
        for base in temp_homop:
            temp_max_length = temp_homop[base]
            try:
                seq_homop[base][temp_max_length] += 1
            except KeyError:
                seq_homop[base][temp_max_length] = 1
        temp_bases_count = Seq_IO.count_bases(temp_seq)
        for key in seq_total_bases:
            seq_total_bases[key] += temp_bases_count[key]
    end = time.time()
    used_time = round(end - start, 2)
    print('Finished analyzing, used %s seconds, printing report ...' % str(used_time))

    # Pad all four homopolymer distributions to the same set of lengths
    all_bases_homop_len = []
    for base in seq_homop:
        for base_length in seq_homop[base]:
            if base_length not in all_bases_homop_len:
                all_bases_homop_len.append(base_length)
    for base in seq_homop:
        for length in all_bases_homop_len:
            try:
                seq_homop[base][length]
            except KeyError:
                seq_homop[base][length] = 0

    # Collect all observed sequence lengths
    all_length = [i for i in seq_length]
    all_length.sort()

    # Write the report
    with open(report_file, 'w') as report:
        report.write('Report:\t%s\n' % report_file)
        report.write('Total number of sequences:\t%d\n\n' % len(seq_content))
        report.write('#' * 100 + '\n')
        report.write('Ambiguous base distribution:\nNumber of N\tNumber of sequences\n')
        for key in sorted(seq_ambiguous.keys()):
            report.write('%d\t%d\n' % (key, seq_ambiguous[key]))
        report.write('#' * 100 + '\n')
        report.write('Max homopolymer distribution:\nMax homopolymer length\tAll bases\tA\tT\tC\tG\n')
        for key in sorted(seq_homop['A'].keys()):
            report.write('%s\t%d\t%d\t%d\t%d\t%d\t\n' % (key, seq_homop['All'][key], seq_homop['A'][key],
                                                         seq_homop['T'][key], seq_homop['C'][key], seq_homop['G'][key]))
        report.write('#' * 100 + '\n')
        report.write('Length distribution:\tMaximum length:\t%d\tMinimum length:\t%d\n' % (max(all_length), min(all_length)))
        report.write('Length\tNumber of sequences\n')
        for key in sorted(seq_length.keys(), reverse=True):
            report.write('%d\t%d\n' % (key, seq_length[key]))
    print('Report on %s can be found in %s.' % (seq_file, report_file))
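A one-line usage sketch (hypothetical file names, my addition):

if __name__ == '__main__':
    # Summarize length, ambiguous-base, and homopolymer distributions.
    main(['-i', 'reads.fasta', '-o', 'reads.report.txt'])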
def main(name_space):
    from lib import random_subsample as rs
    from lib import ParseOtuTable
    from lib import File_IO
    import argparse
    import textwrap
    import time
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -rarefy_otu_table')
    parser.add_argument('-otu', help='Input OTU table')
    parser.add_argument('-o', '--output', help='Output OTU table')
    parser.add_argument('-d', '--depth', help='Sampling depth for each sample')
    parser.add_argument('-iter', default=1, help='Number of iterations for each sample')
    parser.add_argument('-thread', default=1, help='Number of threads')
    parser.add_argument('-keep_all', action='store_true', help='Indicate to keep all samples')
    parser.add_argument('-meta_column', default='taxonomy', help='Name of the first meta data column')
    args = parser.parse_args(name_space)

    input_otu = args.otu
    iter_num = int(args.iter)
    thread = int(args.thread)
    meta_col = args.meta_column
    if args.output:
        output_otu = args.output
    else:
        output_otu = File_IO.name_file(input_otu, '', 'rare')

    otu_table = ParseOtuTable.parser_otu_table(input_otu, meta_col=meta_col)
    input_sample = otu_table.sample_matrix
    start = time.time()
    print('Input OTU table: %s' % input_otu)
    if args.depth:
        depth = int(args.depth)
        print('Sampling depth: %i' % depth)
    else:
        depth = min([sum(i[1:]) for i in input_sample])
        print('Sampling depth set to the minimum sample abundance: %i' % depth)
    print('Number of iterations per sample: %i' % iter_num)
    print('Number of threads: %i' % thread)
    print('Reading in the OTU table ...')

    if args.keep_all:
        count = 0
        for line in input_sample:
            if sum(line[1:]) < depth:
                count += 1
        print('Found %i samples in the OTU table.' % len(input_sample))
        print('%i samples have a total abundance less than the sampling depth, but will be kept in the output.' % count)
    else:
        temp = []
        count = 0
        for line in input_sample:
            if sum(line[1:]) >= depth:
                temp.append(line)
            else:
                count += 1
        input_sample = temp
        print('Found %i samples in the OTU table.' % len(input_sample))
        print('%i samples have a total abundance less than the sampling depth, and will be excluded.' % count)

    otu_id = otu_table.species_id
    otu_table_rarefied = [['OTU_ID'] + otu_id + otu_table.meta_id]
    for sample in input_sample:
        print('Rarefying %s ...' % sample[0])
        repeat_sample = rs.repeat_rarefaction_parallel(sample[1:], depth, iter_num, processor=thread)
        repeat_sample.sort(key=lambda x: sum(i > 0 for i in x))
        repeat_sample = [sample[0]] + repeat_sample[int(iter_num / 2)]  # Pick the rarefied draw with the median richness
        otu_table_rarefied.append(repeat_sample[:])
    otu_table_rarefied = [list(i) for i in zip(*otu_table_rarefied)]

    # Add the meta data columns back
    meta_data = otu_table.meta_dict()
    for line in otu_table_rarefied[1:]:
        for key in otu_table.meta_id:
            line.append(meta_data[key][line[0]])
    for key in otu_table.meta_id:
        otu_table_rarefied[0].append(key)

    with open(output_otu, 'w') as f:
        for line in otu_table_rarefied:
            line = [str(i) for i in line]
            f.write('%s\n' % '\t'.join(line))
    print('Rarefied OTU table saved in %s.' % output_otu)
    end = time.time()
    used_time = round(float(end - start), 2)
    time_per_sample = round(used_time / len(input_sample), 2)
    print('Total time used: %s seconds (%s seconds per sample)' % (str(used_time), str(time_per_sample)))
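A usage sketch (hypothetical file names, my addition). An odd iteration count is a sensible choice here, since the sorted middle draw is then a true median-richness rarefaction:

if __name__ == '__main__':
    # Rarefy every sample to 5000 counts, taking the median-richness draw
    # out of 99 iterations, using 4 threads.
    main(['-otu', 'otu_table.txt', '-o', 'otu_table.rare.txt',
          '-d', '5000', '-iter', '99', '-thread', '4'])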
def main(name_space):
    from lib import random_subsample as rs
    from lib import File_IO
    import argparse
    import textwrap
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -random_subsample')
    parser.add_argument('-i', '--input', help='Name of the input folder with raw data')
    parser.add_argument('-o', '--output', default='random_dataset', help='Name of the output folder for the sampled data')
    parser.add_argument('-file_number', default=10, help='Number of files to pick.')
    parser.add_argument('-size', default=10000, help='Sampling size for each file.')
    args = parser.parse_args(name_space)

    input_folder = args.input
    output_folder = args.output
    file_number = int(args.file_number)
    sample_size = int(args.size)

    # Create the output folder
    File_IO.mk_dir(output_folder)

    # Randomly pick the files to be sampled
    input_file_list = File_IO.file_list(input_folder)
    print('Found {0} files in the folder {1}'.format(len(input_file_list), input_folder))
    file_index = rs.generate_random_index(len(input_file_list), file_number)
    file_list = []
    for index in file_index:
        file_list.append(input_file_list[index])

    # Randomly pick sequences from each chosen file
    for raw_file in file_list:
        print('\tRandomly sampling {0} for {1} sequences ...'.format(raw_file, sample_size))
        current_content = File_IO.read_seqs(input_folder + '/' + raw_file)
        seq_index = rs.generate_random_index(len(current_content), sample_size)
        sampled_content = []
        for index in seq_index:
            sampled_content.append(current_content[index])
        count = File_IO.write_seqs(sampled_content, output_folder + '/' + raw_file)
    print('A randomly sampled dataset ({0} files, {1} sequences per file) was generated under the folder {2}'
          .format(file_number, sample_size, output_folder))
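A usage sketch (hypothetical folder names, my addition):

if __name__ == '__main__':
    # Pick 5 files at random from raw_data/ and sample 1000 reads from each.
    main(['-i', 'raw_data', '-o', 'random_dataset',
          '-file_number', '5', '-size', '1000'])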
def main(Namespace):
    import argparse
    import textwrap
    from lib import File_IO
    from lib import Seq_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -nucl_freq')
    parser.add_argument('-i', '--input', help='Name of the input FASTA or FASTQ file.')
    parser.add_argument('-o', '--output', default='nucl_report.txt', help='Name of the report file.')
    parser.add_argument('-tail', action='store_true', help='Indicate to also count from the tail of the sequences.')
    args = parser.parse_args(Namespace)

    input_file = args.input
    output_file = args.output
    tail_indicator = args.tail
    print('Reading in file: {0} ...'.format(input_file))
    input_seq = File_IO.read_seqs(input_file)
    print('The file contains {0} sequences.'.format(len(input_seq)))
    if tail_indicator:
        print('Counting nucleotide frequencies from both ends of all sequences ...')
    else:
        print('Counting nucleotide frequencies from the head of all sequences ...')
    nucl_freq, unidentified_count = Seq_IO.nucl_freq(input_seq, tail=tail_indicator)

    nucl_list = ['A', 'T', 'C', 'G', 'N']
    # Output the counting result
    header = 'Position\tA\tT\tC\tG\tN\tMost frequent\tFrequency'
    output_content = [header]
    for pos in range(len(nucl_freq)):
        # Find the most frequent nucleotide at this position
        temp_list = []
        for nucl in nucl_list:
            temp_list.append([nucl_freq[pos][nucl], nucl])
        temp_list.sort(reverse=True)
        most_freq_nucl = temp_list[0][1]
        sum_nucl_count = sum(nucl_freq[pos].values())
        most_freq_nucl_freq = float(temp_list[0][0]) / sum_nucl_count  # Frequency of the most frequent nucleotide
        # Build the output line for the current position
        current_line = [str(pos + 1)]
        for nucl in nucl_list:
            current_line.append(str(nucl_freq[pos][nucl]))
        current_line.append(most_freq_nucl)
        current_line.append(str(most_freq_nucl_freq))
        output_content.append('\t'.join(current_line))

    with open(output_file, 'w') as f:
        for line in output_content:
            f.write('%s\n' % line)
    print('A report has been written to {0}'.format(output_file))
    print('A total of {0} nucleotides had unknown letters (only uppercase letters were counted).'.format(unidentified_count))
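A usage sketch (hypothetical file names, my addition):

if __name__ == '__main__':
    # Tabulate per-position nucleotide frequencies from both ends of the reads.
    main(['-i', 'reads.fastq', '-tail', '-o', 'nucl_report.txt'])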
def main(Namespace):
    import argparse
    import textwrap
    from lib import ParseOtuMap
    from lib import File_IO
    import sys
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -filter_otu_map')
    parser.add_argument('-i', '--input', help='Input OTU map')
    parser.add_argument('-o', '--output', help='Output OTU map')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-min_size', default=2, help='The minimum size of an OTU to be kept')
    group.add_argument('-name_list', help='A file containing a list of sequence names to be picked')
    group.add_argument('-fasta', help='A FASTA file containing the sequences to be picked')
    args = parser.parse_args(Namespace)

    map_file = args.input
    output_file = args.output
    if args.min_size:
        min_size = int(args.min_size)
    if args.name_list:
        with open(args.name_list, 'rU') as f:
            pick_list = []
            for line in f:
                pick_list.append(line.strip('\n'))
    if args.fasta:
        seqs = File_IO.read_seqs(args.fasta)
        pick_list = []
        for record in seqs:
            pick_list.append(record[0])

    print('Reading in %s ...' % map_file)
    MapDict = ParseOtuMap.read_otu_map(map_file)

    # Filter the OTU map based on the given parameters
    if args.min_size:
        print('Filtering out OTUs with fewer than %d sequences ...' % min_size)
        MapDictFiltered = ParseOtuMap.filter_by_size(MapDict, min_size=min_size)
    if args.name_list or args.fasta:
        print('Picking OTUs based on the provided names ...')
        MapDictFiltered = {}
        for name in pick_list:
            try:
                MapDictFiltered[name] = MapDict[name]
            except KeyError:
                print('Cannot find %s in the OTU map. Program exits.' % name)
                sys.exit()

    # Report a comparison of the original and filtered maps
    old_map = ParseOtuMap.otu_map_parser(MapDict)
    new_map = ParseOtuMap.otu_map_parser(MapDictFiltered)
    print('\n')
    print('Original OTU map:')
    print('\t OTU=%i (Total Sequences=%i, Max=%i, Min=%i, Ave=%i)' % (
        old_map.derep_count, old_map.seqs_count, old_map.max_derep, old_map.min_derep, old_map.ave_derep))
    print('Filtered OTU map:')
    print('\t OTU=%i (Total Sequences=%i, Max=%i, Min=%i, Ave=%i)' % (
        new_map.derep_count, new_map.seqs_count, new_map.max_derep, new_map.min_derep, new_map.ave_derep))
    print('Writing the new map ...')
    ParseOtuMap.write_otu_map(MapDictFiltered, output_file=output_file)
    print('New map saved in %s.' % output_file)
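A usage sketch (hypothetical file names, my addition):

if __name__ == '__main__':
    # Drop singleton OTUs: keep only OTUs with at least 2 sequences.
    main(['-i', 'otu_map.txt', '-o', 'otu_map.min2.txt', '-min_size', '2'])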
def main(name_space):
    import argparse
    import textwrap
    from lib import ParseOtuMap
    from lib import File_IO
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog=textwrap.dedent('''\
                                        ------------------------
                                        By Zewei Song
                                        University of Minnesota
                                        Dept. Plant Pathology
                                        [email protected]
                                        ------------------------'''),
                                     prog='fast.py -subset_fast_hybrid')
    parser.add_argument('-i', '--input', help='Input FAST hybrid map.')
    parser.add_argument('-o', '--output', help='Output prefix for the derep map and sequence file')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-otu_list', help='A list of OTU names separated by ","')
    group.add_argument('-otu_file', help='A file containing a list of OTU names (no header)')
    args = parser.parse_args(name_space)

    print('Subsetting a FAST hybrid map with the provided OTU names ...')
    input_file = args.input
    output_derep = args.output + '.txt'
    output_fasta = args.output + '.fasta'
    otu_list = []
    if args.otu_list:
        otu_list = args.otu_list.split(',')
    elif args.otu_file:
        with open(args.otu_file) as f:
            for line in f:
                otu_list.append(line.strip())  # Strip the newline so names match the map keys
    print('Found {0} OTU names.'.format(len(otu_list)))

    print('Reading in the FAST hybrid map: {0} ...'.format(input_file))
    hybrid_map = ParseOtuMap.read_fast_output(input_file)
    fast_derep = {}
    for otu in otu_list:
        fast_derep[otu] = hybrid_map[otu]  # Keep both the 'seq' and 'sample' records for this OTU
    ParseOtuMap.write_fast_output(fast_derep, output_derep)
    print('A FAST derep map was written to: {0}.'.format(output_derep))

    derep_seq = []
    for key, value in fast_derep.items():
        derep_size = sum(value['sample'].values())
        seq_label = key + ';size=' + str(derep_size)
        derep_seq.append([derep_size, seq_label, value['seq']])
    derep_seq.sort(reverse=True)
    derep_seq = [i[1:] for i in derep_seq]
    count = File_IO.write_seqs(derep_seq, output_fasta, checker=False)
    print('A dereplicated FASTA file was written to {0}, containing {1} sequences with size annotation.'
          .format(output_fasta, count))
    print('\n')
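A usage sketch (hypothetical file and OTU names, my addition; OTU names follow the 'derep_N' convention used by the dereplicate step):

if __name__ == '__main__':
    # Extract two derep units from the hybrid map into subset.txt / subset.fasta.
    main(['-i', 'hybrid_map.txt', '-o', 'subset', '-otu_list', 'derep_0,derep_5'])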