def test_aligned(self):
    d, e = seqsum.summarize_distances(self.seqs,
            sample_size=0,
            per_site=False,
            aligned=True,
            ignore_gaps=True)
    self.assertEqual(e, [])
    self.assertEqual(sorted(d.keys()), sorted(self.expected_means.keys()))
    for k in iterkeys(d):
        self.assertEqual(d[k].maximum, self.expected_maxs[k])
        self.assertAlmostEqual(d[k].mean, self.expected_means[k])
def digest_seq(recognition_seq, seq_record, out_dir, append_dict,
        extra_length=0,
        min_length=0,
        max_length=None,
        include_overhang=True):
    """
    Digest `seq_record` with `recognition_seq`, write the fragment-length
    distribution to a tab-delimited file in `out_dir`, and accumulate the
    counts into `append_dict`.

    Returns the updated `append_dict`, the total number of fragments, the
    number of fragments with lengths in [`min_length`, `max_length`], and
    the length of the molecule.
    """
    if max_length:
        ml = str(max_length)
    else:
        ml = 'max'
    _LOG.info("Digesting seq {0} with recognition seq {1}...".format(
            seq_record.id, str(recognition_seq.seq)))
    digest = DigestSummary(recognition_seq=recognition_seq,
            seq_record=seq_record,
            extra_length=extra_length,
            include_overhang=include_overhang)
    digest_total = 0
    digest_filter = 0
    out_file_path = os.path.join(out_dir,
            ".".join([digest.molecule_name, 'txt']))
    out = open(out_file_path, 'w')
    out.write("{0}\t{1}\n".format('fragment_length', 'frequency'))
    for l in sorted(iterkeys(digest.length_distribution)):
        f = digest.length_distribution[l]
        out.write("{0}\t{1}\n".format(l, f))
        digest_total += f
        if max_length:
            if l <= max_length and l >= min_length:
                digest_filter += f
        else:
            if l >= min_length:
                digest_filter += f
        if l not in append_dict:
            append_dict[l] = 0
        append_dict[l] += f
    out.close()
    _LOG.info('\nMolecule id: {0}\n'.format(digest.molecule_id) +
            'Molecule name: {0}\n'.format(digest.molecule_name) +
            'Molecule description: {0}\n'.format(
                    digest.molecule_description) +
            'Molecule length: {0}\n'.format(digest.molecule_length) +
            '\ttotal fragments: {0}\n'.format(digest_total) +
            '\tfragments of length {0}-{1}: {2}\n'.format(
                    min_length, ml, digest_filter) +
            '\tfragment length distribution written to {0}'.format(
                    out_file_path))
    return append_dict, digest_total, digest_filter, digest.molecule_length
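# A minimal usage sketch of digest_seq, mirroring the call pattern in main()
# below. The recognition sequence and cut site are the NotI example from the
# command-line help; the input file name and output directory are
# hypothetical, and out_dir is assumed to already exist (main() creates it
# with mkdr).
def example_digest():
    rseq = RecognitionSeq('GCGGCCGC', 2)  # NotI: 5'---GC \ GGCCGC---3'
    combined = {}
    f = OpenFile('example.gb', 'r')  # hypothetical genbank-formatted file
    for seq in dataio.get_seq_iter_from_file(f,
            format='gb',
            data_type='dna',
            ambiguities=True):
        # Defaults: extra_length=0, min_length=0, max_length=None,
        # include_overhang=True.
        combined, count, fcount, mol_length = digest_seq(
                recognition_seq=rseq,
                seq_record=seq,
                out_dir='digests',
                append_dict=combined)
    f.close()
    return combined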
def test_full_alignment_muscle(self):
    if not functions.which('muscle'):
        _LOG.warning('muscle not found... skipping tests.')
        return
    d, e = seqsum.summarize_distances(self.seqs,
            sample_size=0,
            per_site=False,
            aligned=False,
            ignore_gaps=True,
            do_full_alignment=True,
            aligner_tools=['muscle'])
    self.assertEqual(e, [])
    self.assertEqual(sorted(d.keys()), sorted(self.expected_means.keys()))
    for k in iterkeys(d):
        self.assertEqual(d[k].maximum, self.expected_maxs[k])
        self.assertAlmostEqual(d[k].mean, self.expected_means[k])
def test_extra_length(self):
    expected = {
            'JF314863': {23: 2, 75: 1, 185: 1},
            'JF314864': {23: 1, 75: 1, 190: 1},
            'JF314865': {23: 1, 85: 1, 188: 1},
            'JF314866': {23: 2, 75: 1, 185: 1},
            'combined': {23: 6, 75: 3, 85: 1, 185: 2, 188: 1, 190: 1}}
    rs = 'TAG'
    cs = '3'
    self.exe_seqdigest(['-s', rs, '-c', cs,
            '-g', '354698776,354698778',
            '-x', '10',
            package_paths.data_path('JF314865-JF314866.gb')])
    results = {}
    for k in iterkeys(expected):
        result_file_path = os.path.join(self.test_dir, ".".join([k, 'txt']))
        self.appendTestFile(result_file_path)
        results[k] = self.parse_result_file(result_file_path)
    self.assertEqual(expected, results)
def test_accessions(self):
    expected = {
            'JF314863': {13: 2, 65: 1, 175: 1},
            'JF314864': {13: 1, 65: 1, 180: 1},
            'JF314865': {13: 1, 75: 1, 178: 1},
            'JF314866': {13: 2, 65: 1, 175: 1},
            'combined': {13: 6, 65: 3, 75: 1, 175: 2, 178: 1, 180: 1}}
    rs = 'TAG'
    cs = '3'
    self.exe_seqdigest(['-s', rs, '-c', cs,
            '-a', 'JF314863,JF314864',
            package_paths.data_path('JF314865-JF314866.gb')])
    results = {}
    for k in iterkeys(expected):
        result_file_path = os.path.join(self.test_dir, ".".join([k, 'txt']))
        self.appendTestFile(result_file_path)
        results[k] = self.parse_result_file(result_file_path)
    self.assertEqual(expected, results)
def main():
    description = '{name} {version}'.format(**_program_info)
    usage = ("\n %prog [options] -s <RECOGNITION_SEQUENCE> [<GENBANK_FILE1> "
             "<GENBANK_FILE2> ...]")
    parser = OptionParser(usage=usage, description=description,
            version=_program_info['version'], add_help_option=True)
    parser.add_option("-s", "--recognition_seq", dest="recognition_seq",
            type="string",
            help="Recognition sequence, 5' to 3', of restriction enzyme.")
    parser.add_option("-c", "--cut_site", dest="cut_site", type="int",
            help=("One-based index of the last base before the cut site in "
                  "the recognition sequence. E.g., NotI: "
                  "5'---GC \\ GGCCGC---3' has a cut site of 2, and would be "
                  "passed to this program with '-s GCGGCCGC -c 2'."))
    parser.add_option("-a", "--accessions", dest="accessions", type="string",
            help=("GenBank accession numbers. "
                  "E.g., -a JF314862,JF314864-314866 -OR- "
                  "-a 'JF314862, JF314864-314866'"))
    parser.add_option("-g", "--gi_numbers", dest="gi_numbers", type="string",
            help=("GenBank GI numbers. "
                  "E.g., -g 354698774,354698776-354698779 -OR- "
                  "-g '354698774, 354698776-354698779'"))
    parser.add_option("--format", dest="format", type="string",
            help=("Format of sequence files. Valid options are 'fasta' and "
                  "'gb', for fasta- and genbank-formatted files, "
                  "respectively. The default is to guess the format based "
                  "on your file extensions; .fas or .fasta translate to "
                  "'fasta', whereas .gb or .genbank translate to 'gb'. This "
                  "option overrides the default behavior, in which case all "
                  "files must be of the format provided with this flag."))
    parser.add_option("-x", "--extra_length", dest="extra_length", type="int",
            default=0,
            help=("Extra length (in bases) to add to each fragment. For "
                  "example, you can include the length of oligos ligated to "
                  "each fragment. This length is only added once, so if you "
                  "want to simulate the ligation of oligos to both ends of "
                  "each fragment, provide the TOTAL length of the oligos "
                  "ligated to each fragment. Do not add length for the "
                  "overhang left by the restriction enzyme. That is handled "
                  "by the program."))
    parser.add_option("--min_length", dest="min_length", type="int",
            default=0,
            help="Minimum fragment length to include in count.")
    parser.add_option("--max_length", dest="max_length", type="int",
            help="Maximum fragment length to include in count.")
    parser.add_option("-o", "--output_dir", dest="output_dir", type="string",
            help="Path to output directory. Default is './digests/'")
    parser.add_option("-d", "--debugging", dest="debugging", default=False,
            action="store_true",
            help="Run in debugging mode.")
    (options, args) = parser.parse_args()
    if not options.recognition_seq or not options.cut_site:
        _LOG.error("You must provide a recognition sequence and cut site")
        parser.print_help(sys.stderr)
        sys.exit(1)
    if not options.gi_numbers and len(args) < 1:
        _LOG.error("You must provide a sequence to digest, either via the "
                   "gi number option or sequence file arguments")
        parser.print_help(sys.stderr)
        sys.exit(1)
    if options.format:
        format = EXTENSIONS[options.format.lower()]
    if not options.output_dir:
        out_dir = os.path.abspath(os.path.join(os.path.curdir, 'digests'))
    else:
        out_dir = os.path.expanduser(os.path.expandvars(options.output_dir))
    mkdr(out_dir)
    if not os.path.isdir(out_dir):
        _LOG.error("Output path {0} is not a directory".format(out_dir))
        sys.exit(1)
    if options.max_length and options.max_length < options.min_length:
        _LOG.error(
                "max_length ({0}) cannot be less than min_length "
                "({1})".format(options.max_length, options.min_length))
        sys.exit(1)
    if options.max_length:
        ml = str(options.max_length)
    else:
        ml = 'max'
    if options.debugging and _PS:
        proc = psutil.Process(os.getpid())
        max_mem = proc.get_memory_info().rss
    t_start = datetime.datetime.now()
    rseq = RecognitionSeq(str(options.recognition_seq), options.cut_site)
    gi_list = []
    if options.gi_numbers:
        gi_list = parse_gi_numbers(options.gi_numbers)
    if options.accessions:
        gi_list += parse_accession_numbers(options.accessions)
    combined = {}
    digested = []
    total_count = 0
    filter_count = 0
    total_length = 0
    for gi in gi_list:
        _LOG.info("Downloading gi {0}...".format(gi))
        seq_iter = fetch_gb_seqs(str(gi), data_type='dna')
        for seq in seq_iter:
            digested.append(seq.id)
            combined, count, fcount, mol_length = digest_seq(
                    recognition_seq=rseq,
                    seq_record=seq,
                    out_dir=out_dir,
                    append_dict=combined,
                    extra_length=options.extra_length,
                    min_length=options.min_length,
                    max_length=options.max_length,
                    include_overhang=True)
            total_count += count
            filter_count += fcount
            total_length += mol_length
            if options.debugging and _PS:
                max_mem = max([max_mem, proc.get_memory_info().rss])
    for file_path in args:
        try:
            f = OpenFile(file_path, 'r')
        except:
            _LOG.error("Could not open file {0}... skipping!".format(
                    file_path))
            continue
        if not options.format:
            format = EXTENSIONS[file_path.split('.')[-1].strip().lower()]
        seq_iter = dataio.get_seq_iter_from_file(f,
                format=format,
                data_type='dna',
                ambiguities=True)
        for seq in seq_iter:
            if seq.id in digested:
                _LOG.warning("Sequence {0} already digested... "
                        "skipping!".format(seq.id))
                continue
            digested.append(seq.id)
            combined, count, fcount, mol_length = digest_seq(
                    recognition_seq=rseq,
                    seq_record=seq,
                    out_dir=out_dir,
                    append_dict=combined,
                    extra_length=options.extra_length,
                    min_length=options.min_length,
                    max_length=options.max_length,
                    include_overhang=True)
            total_count += count
            filter_count += fcount
            total_length += mol_length
            if options.debugging and _PS:
                max_mem = max([max_mem, proc.get_memory_info().rss])
        f.close()
    _LOG.info('Finished digests!')
    out_file_path = os.path.join(out_dir, 'combined.txt')
    out = open(out_file_path, 'w')
    out.write("{0}\t{1}\n".format('fragment_length', 'frequency'))
    for l in sorted(iterkeys(combined)):
        f = combined[l]
        out.write("{0}\t{1}\n".format(l, f))
    out.close()
    _LOG.info('\nSummary over ALL molecules:\n'
            '\ttotal length: {0}\n'.format(total_length) +
            '\ttotal fragments: {0}\n'.format(total_count) +
            '\tfragments of length {0}-{1}: {2}\n'.format(
                    options.min_length, ml, filter_count) +
            '\tfragment length distribution written to {0}\n'.format(
                    out_file_path))
    t_end = datetime.datetime.now()
    _LOG.info('start time: {0}\n'
            'end time: {1}\n'
            'run time: {2}\n'.format(str(t_start), str(t_end),
                    str(t_end - t_start)))
    if options.debugging and _PS:
        _LOG.info('max memory (MB): {0}'.format(float(max_mem) / 1048576))