def test_all_orfs(self):
    '''Test all_orfs()

    Loads sequence '1' from sequences_test_orfs.fa, asks for every ORF with
    min_length=120 and compares the result, element by element and in
    order, with a hand-written list of (interval, flag) pairs.
    '''
    seqs = {}
    tasks.file_to_dict(os.path.join(data_dir, 'sequences_test_orfs.fa'), seqs)
    orfs = seqs['1'].all_orfs(min_length=120)

    # NOTE(review): the boolean presumably marks ORFs found on the reverse
    # strand - confirm against the all_orfs() implementation.
    expected = [
        (intervals.Interval(27, 221), False),
        (intervals.Interval(44, 226), False),
        (intervals.Interval(48, 170), True),
        (intervals.Interval(109, 240), False),
        (intervals.Interval(143, 265), True),
        (intervals.Interval(227, 421), False),
        (intervals.Interval(277, 432), True),
        (intervals.Interval(286, 477), False),
        (intervals.Interval(288, 518), True),
        (intervals.Interval(562, 702), False),
        (intervals.Interval(600, 758), False),
        (intervals.Interval(605, 817), False),
        (intervals.Interval(818, 937), False),
        (intervals.Interval(835, 987), False),
        (intervals.Interval(864, 998), False),
    ]

    # Check the count first: zip() below would silently stop at the shorter
    # of the two lists.
    self.assertEqual(len(orfs), len(expected))
    for found, wanted in zip(orfs, expected):
        self.assertEqual(found[0], wanted[0])
        self.assertEqual(found[1], wanted[1])
def __init__(self,
             fasta_file,
             working_directory=None,
             cutoff_contig_length=2000,
             percent_match=95,
             skip=None,
             summary_file="contig_cleanup_summary.txt",
             summary_prefix="[contig cleanup]",
             debug=False):
    ''' Constructor

    Stores the run settings, loads every contig from fasta_file into
    self.contigs, collects the ids to skip and works out the output
    file name.

    fasta_file            -- sequence file read with tasks.file_to_dict
    working_directory     -- defaults to the current working directory
    cutoff_contig_length  -- stored as given (used elsewhere in the class)
    percent_match         -- stored as given (used elsewhere in the class)
    skip                  -- None, an in-memory collection of ids (set,
                             frozenset, list or tuple), or the name of a
                             file holding one id per line
    summary_file          -- stored as given
    summary_prefix        -- stored as given
    debug                 -- stored as given
    '''
    self.fasta_file = fasta_file
    self.working_directory = working_directory if working_directory else os.getcwd()
    self.cutoff_contig_length = cutoff_contig_length
    self.percent_match = percent_match
    self.summary_file = summary_file
    self.summary_prefix = summary_prefix
    self.debug = debug

    # Read contig ids and sequences into dict
    self.contigs = {}
    tasks.file_to_dict(self.fasta_file, self.contigs)

    self.ids_to_skip = set()
    if skip:
        if isinstance(skip, (set, frozenset, list, tuple)):
            # Ids were handed over directly; copy them so that later
            # changes to the caller's collection do not leak in here.
            self.ids_to_skip = set(skip)
        else:
            # Anything else is treated as the name of a file of ids.
            fh = fastaqutils.open_file_read(skip)
            try:
                for line in fh:
                    self.ids_to_skip.add(line.rstrip())
            finally:
                # Close the handle even if reading fails part-way.
                fastaqutils.close(fh)

    self.output_file = self._build_final_filename()
def test_file_to_dict(self):
    '''check file_to_dict fills dictionary correctly'''
    loaded = {}
    tasks.file_to_dict(os.path.join(data_dir, 'sequences_test.fa'), loaded)

    # The test file is expected to hold sequences '1'..'4', each 'ACGTA'.
    expected = {}
    for i in range(1, 5):
        expected[str(i)] = sequences.Fasta(str(i), 'ACGTA')

    # Dictionary keys are not a sequence and their order is not part of the
    # contract being tested, so compare the ids as sets.
    self.assertEqual(set(loaded), set(expected))

    for key in sorted(expected):
        self.assertEqual(loaded[key].id, expected[key].id)
        self.assertEqual(loaded[key].seq, expected[key].seq)
def test_capillary_to_pairs(self):
    '''Check that capillary reads file converted to paired and unpaired'''
    tmp_prefix = 'tmp.cap_to_pairs'
    paired_out = tmp_prefix + '.paired.gz'
    unpaired_out = tmp_prefix + '.unpaired.gz'
    source = os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa')

    try:
        tasks.capillary_to_pairs(source, tmp_prefix)

        # sequences have been hashed, so could be in any order in
        # output files. So need to check contents of files are OK
        d_correct_paired = {}
        d_correct_unpaired = {}
        tasks.file_to_dict(source + '.paired.gz', d_correct_paired)
        tasks.file_to_dict(source + '.unpaired.gz', d_correct_unpaired)

        d_test_paired = {}
        d_test_unpaired = {}
        tasks.file_to_dict(paired_out, d_test_paired)
        tasks.file_to_dict(unpaired_out, d_test_unpaired)

        self.assertDictEqual(d_test_paired, d_correct_paired)
        self.assertDictEqual(d_test_unpaired, d_correct_unpaired)
    finally:
        # Remove the temporary outputs even when an assertion fails, and
        # tolerate them never having been created so that a cleanup error
        # cannot hide the real failure.
        for leftover in (paired_out, unpaired_out):
            if os.path.exists(leftover):
                os.unlink(leftover)
def files_are_equal(file1, file2):
    '''Return True if both sequence files hold the same ids with sequences of equal length.

    Both files are loaded with tasks.file_to_dict. The files count as
    different when they hold a different number of sequences, when a name
    from file1 is missing from file2, or when a matching pair differs in
    id or length.

    NOTE(review): only the lengths are compared, not the sequence
    contents - confirm this is intended.
    '''
    seqs1 = {}
    seqs2 = {}
    tasks.file_to_dict(file1, seqs1)
    tasks.file_to_dict(file2, seqs2)

    if len(seqs1) != len(seqs2):
        return False

    for name, seq1 in seqs1.items():
        # Same number of records does not imply the same names: a missing
        # name means the files differ (and must not raise KeyError).
        if name not in seqs2:
            return False
        seq2 = seqs2[name]
        if seq1.id != seq2.id or len(seq1) != len(seq2):
            return False

    return True
def test_finding_dnaA(self):
    '''Run the break finder with different skip lists and check the resulting contigs'''
    input_fasta = os.path.join(data_dir, "BREAKING_input_0.fa")
    gene_fasta = os.path.join(data_dir, "test_dnaA_1.fa")

    # Each case pairs a configured finder with the name of the file in
    # data_dir that holds the contigs expected after run().
    tests = [
        # skip 1, dnaA normal - nothing should change
        (contig_break_finder.ContigBreakFinder(
            fasta_file=input_fasta,
            gene_file=gene_fasta,
            choose_random_gene=False,
            rename=False,
            skip=os.path.join(data_dir, "BREAKING_skip_ids_0.txt")),
         'BREAKING_input_0.fa'),
        # skip all - nothing should change
        (contig_break_finder.ContigBreakFinder(
            fasta_file=input_fasta,
            gene_file=gene_fasta,
            choose_random_gene=False,
            rename=False,
            skip=os.path.join(data_dir, "BREAKING_skip_ids_0_all.txt")),
         'BREAKING_input_0.fa'),
        # skip none - test1 contig should be circularised
        (contig_break_finder.ContigBreakFinder(
            fasta_file=input_fasta,
            gene_file=gene_fasta,
            rename=False,
            choose_random_gene=False),
         'BREAKING_output_0_all.fa'),
    ]

    for finder, expected_name in tests:
        finder.run()
        self.assertTrue(os.path.isfile(finder.output_file))
        self.assertTrue(os.path.isfile(finder.summary_file))

        # Read expected output file and compare sequences
        expected_contigs = {}
        tasks.file_to_dict(os.path.join(data_dir, expected_name), expected_contigs)
        for contig_id, expected_contig in expected_contigs.items():
            # assertEqual reports both operands on failure, unlike
            # assertTrue(a == b).
            self.assertEqual(expected_contig, finder.contigs[contig_id])

        os.remove(finder.output_file)
        os.remove(finder.summary_file)
def __init__(self,
             fasta_file='',
             working_directory=None,
             contigs=None,
             alignments=None,
             trim=True,
             trim_reversed_overlaps=False,
             overlap_offset=1000,
             overlap_boundary_max=50,
             overlap_min_length=1000,
             overlap_max_length=3000,
             overlap_percent_identity=85,
             min_trim_length=0.89,
             skip=None,
             summary_file="contig_trimming_summary.txt",
             summary_prefix='[contig trimmer]',
             debug=False):
    ''' Constructor

    Stores the trimming settings and makes sure self.contigs is
    populated, reading fasta_file when no contigs were handed in.

    fasta_file            -- sequence file; read only when contigs is
                             empty or not given
    working_directory     -- defaults to the current working directory
    contigs               -- optional dict of already loaded contigs
    alignments            -- optional list of alignments; a fresh empty
                             list is used when omitted
    overlap_boundary_max  -- given as a percentage; stored as a fraction
                             (value * 0.01)
    skip                  -- passed to utils.parse_file_or_set to obtain
                             self.ids_to_skip
    All remaining arguments are stored unchanged under the same name.
    '''
    self.fasta_file = fasta_file
    self.working_directory = working_directory if working_directory else os.getcwd()
    # None replaces the former mutable default arguments ({} and []),
    # which were created once and shared by every instance built without
    # these arguments.
    self.contigs = {} if contigs is None else contigs
    self.alignments = [] if alignments is None else alignments
    self.trim = trim
    self.trim_reversed_overlaps = trim_reversed_overlaps
    self.overlap_offset = overlap_offset
    self.overlap_boundary_max = overlap_boundary_max * 0.01
    self.overlap_min_length = overlap_min_length
    self.overlap_max_length = overlap_max_length
    self.overlap_percent_identity = overlap_percent_identity
    self.min_trim_length = min_trim_length
    self.ids_to_skip = utils.parse_file_or_set(skip)
    self.summary_file = summary_file
    self.summary_prefix = summary_prefix
    self.output_file = self._build_final_filename()
    self.debug = debug

    # Extract contigs. A fresh dict is used so that an empty dict supplied
    # by the caller is left untouched.
    if not self.contigs:
        self.contigs = {}
        tasks.file_to_dict(self.fasta_file, self.contigs)
def __init__(self,
             fasta_file,
             gene_file,
             skip=None,  # Avoid circularising contigs with these ids
             hit_percent_id=80,
             match_length_percent=100,
             choose_random_gene=True,
             rename=True,
             working_directory=None,
             summary_file="contig_breaks_summary.txt",
             summary_prefix="[contig break finder]",
             debug=False):
    ''' Attributes

    Stores the run settings, works out the output file name, loads every
    contig from fasta_file into self.contigs and collects the ids to
    skip.

    fasta_file         -- sequence file read with tasks.file_to_dict
    gene_file          -- stored as given
    skip               -- None, an in-memory collection of ids (set,
                          frozenset, list or tuple), or the name of a file
                          holding one id per line
    working_directory  -- defaults to the current working directory
    All remaining arguments are stored unchanged under the same name.
    '''
    self.fasta_file = fasta_file
    self.gene_file = gene_file
    self.hit_percent_id = hit_percent_id
    self.match_length_percent = match_length_percent
    self.choose_random_gene = choose_random_gene
    self.rename = rename
    self.working_directory = working_directory if working_directory else os.getcwd()
    self.summary_file = summary_file
    self.summary_prefix = summary_prefix
    self.output_file = self._build_final_filename()
    self.debug = debug

    # Read contig ids and sequences into dict
    self.contigs = {}
    tasks.file_to_dict(self.fasta_file, self.contigs)
    self.random_gene_starts = {}

    self.ids_to_skip = set()
    if skip:
        if isinstance(skip, (set, frozenset, list, tuple)):
            # Ids were handed over directly; copy them so that later
            # changes to the caller's collection do not leak in here.
            self.ids_to_skip = set(skip)
        else:
            # Anything else is treated as the name of a file of ids.
            fh = fastaqutils.open_file_read(skip)
            try:
                for line in fh:
                    self.ids_to_skip.add(line.rstrip())
            finally:
                # Close the handle even if reading fails part-way.
                fastaqutils.close(fh)
def test_contig_overlap_trimmer(self):
    '''Test contig overlap trimming'''
    # test data
    test_fasta_file = os.path.join(data_dir, "TRIMMING_input_1.fa")

    # One row per alignment; the fields are joined with tabs below to form
    # the coordinate lines handed to alignment.Alignment.
    coord_rows = [
        ['1', '4', '57', '60', '4', '4', '100.00', '60', '60', '1', '1', 'contig1', 'contig1'],
        ['1', '4', '57', '60', '4', '4', '100.00', '60', '60', '1', '1', 'contig2', 'contig2'],
        ['2', '4', '57', '59', '3', '3', '100.00', '60', '60', '1', '1', 'contig3', 'contig3'],
        ['2', '4', '54', '56', '3', '3', '100.00', '60', '60', '1', '1', 'contig4', 'contig4'],
        ['1', '3', '58', '60', '3', '3', '100.00', '60', '60', '1', '1', 'contig4', 'contig4'],
        ['1', '4', '57', '60', '4', '4', '100.00', '60', '60', '1', '1', 'contig5', 'contig5'],
        # Overlap too short
        ['1', '2', '59', '60', '2', '2', '100.00', '60', '60', '1', '1', 'contig6', 'contig6'],
        # Trimmed length would be too short
        ['1', '12', '49', '60', '12', '12', '100.00', '60', '60', '1', '1', 'contig7', 'contig7'],
        # overlap reversed
        ['1', '3', '60', '58', '3', '3', '100.00', '60', '60', '-1', '-1', 'contig8', 'contig8'],
        # No overlap for contig 9
        # beyond offset
        ['4', '7', '36', '38', '4', '4', '100.00', '60', '60', '1', '-1', 'contig10', 'contig10'],
    ]
    test_overlap_coords = ['\t'.join(fields) for fields in coord_rows]
    test_overlap_alignments = [alignment.Alignment(coord) for coord in test_overlap_coords]

    overlap_trimmer = contig_overlap_trimmer.ContigOverlapTrimmer(
        fasta_file=test_fasta_file,
        alignments=test_overlap_alignments,
        overlap_offset=10,
        overlap_min_length=3,
        overlap_max_length=12,
    )
    overlap_trimmer.run()

    self.assertTrue(os.path.isfile(overlap_trimmer.output_file))
    self.assertTrue(os.path.isfile(overlap_trimmer.summary_file))

    expected_contigs = {}
    tasks.file_to_dict(os.path.join(data_dir, "TRIMMING_output_1.fa"), expected_contigs)
    for contig_id, expected_contig in expected_contigs.items():
        # assertEqual reports both operands on failure, unlike
        # assertTrue(a == b).
        self.assertEqual(expected_contig, overlap_trimmer.contigs[contig_id])

    os.remove(overlap_trimmer.output_file)
    os.remove(overlap_trimmer.summary_file)
def test_capillary_to_pairs(self):
    '''Check that capillary reads file converted to paired and unpaired'''
    def remove_if_present(path):
        # Tolerate a missing file so cleanup cannot mask the real failure.
        if os.path.exists(path):
            os.unlink(path)

    tmp_prefix = 'tmp.cap_to_pairs'
    reads_file = os.path.join(data_dir, 'sequences_test_cap_to_read_pairs.fa')

    # Register the cleanup before anything can fail, so the temporary
    # outputs are removed even when an assertion below does not hold.
    for suffix in ('.paired.gz', '.unpaired.gz'):
        self.addCleanup(remove_if_present, tmp_prefix + suffix)

    tasks.capillary_to_pairs(reads_file, tmp_prefix)

    # sequences have been hashed, so could be in any order in
    # output files. So need to check contents of files are OK
    d_correct_paired = {}
    d_correct_unpaired = {}
    tasks.file_to_dict(reads_file + '.paired.gz', d_correct_paired)
    tasks.file_to_dict(reads_file + '.unpaired.gz', d_correct_unpaired)

    d_test_paired = {}
    d_test_unpaired = {}
    tasks.file_to_dict(tmp_prefix + '.paired.gz', d_test_paired)
    tasks.file_to_dict(tmp_prefix + '.unpaired.gz', d_test_unpaired)

    self.assertDictEqual(d_test_paired, d_correct_paired)
    self.assertDictEqual(d_test_unpaired, d_correct_unpaired)
help='Minimum length of contig to output [%(default)s]', default=200) parser.add_argument('--nucmer_options', help='Options when running nucmer [%(default)s]', default='') parser.add_argument('contigs_fa', help='Name of contigs fasta file', metavar='contigs.fa') parser.add_argument('ref_fa', help='Name of reference fasta file', metavar='reference.fa') parser.add_argument('outprefix', help='Prefix of output files') options = parser.parse_args() ref_seqs = {} tasks.file_to_dict(options.ref_fa, ref_seqs) nucmer_out_prefix = options.outprefix + '.nucmer' nucmer_out_delta = nucmer_out_prefix + '.delta' nucmer_out_filter = nucmer_out_prefix + '.delta-filter' nucmer_out_coords = nucmer_out_filter + '.coords' # run nucmer of contigs vs ref utils.syscall(' '.join([ 'nucmer', options.nucmer_options, '-p', nucmer_out_prefix, options.ref_fa, options.contigs_fa ])) utils.syscall(' '.join([ 'delta-filter', '-i 98 -l 180 -q', nucmer_out_delta, '>', nucmer_out_filter ])) utils.syscall(' '.join(
contigs[(nucmer_hit.ref_name, nucmer_hit.ref_start, nucmer_hit.ref_end)] = contig parser = argparse.ArgumentParser( description="Takes contigs and a reference sequence. Makes a new fasta file of the contigs, but they are now perfect sequences by using the reference instead", usage="%(prog)s [options] <contigs.fa> <reference.fa> <outprefix>", ) parser.add_argument("--min_seq_length", type=int, help="Minimum length of contig to output [%(default)s]", default=200) parser.add_argument("--nucmer_options", help="Options when running nucmer [%(default)s]", default="") parser.add_argument("contigs_fa", help="Name of contigs fasta file", metavar="contigs.fa") parser.add_argument("ref_fa", help="Name of reference fasta file", metavar="reference.fa") parser.add_argument("outprefix", help="Prefix of output files") options = parser.parse_args() ref_seqs = {} tasks.file_to_dict(options.ref_fa, ref_seqs) nucmer_out_prefix = options.outprefix + ".nucmer" nucmer_out_delta = nucmer_out_prefix + ".delta" nucmer_out_filter = nucmer_out_prefix + ".delta-filter" nucmer_out_coords = nucmer_out_filter + ".coords" # run nucmer of contigs vs ref utils.syscall(" ".join(["nucmer", options.nucmer_options, "-p", nucmer_out_prefix, options.ref_fa, options.contigs_fa])) utils.syscall(" ".join(["delta-filter", "-i 98 -l 180 -q", nucmer_out_delta, ">", nucmer_out_filter])) utils.syscall(" ".join(["show-coords", "-dTlro", nucmer_out_filter, ">", nucmer_out_coords])) # load hits into hash. key=ref_name, value=another hash with key=qry_name, value=list of hit positions in that ref seq nucmer_hits = {} contigs_to_print = {}
def test_finding_dnaA_in_various_positions(self):
    '''Run the break finder on contigs with dnaA in various positions and check the resulting sequences'''
    def make_finder(input_fasta, gene_fasta="BREAKFINDER_test_dnaA.fa", rename=False, skip=None):
        # Build a finder on files from data_dir. choose_random_gene is
        # always off here; skip is only passed on when a skip file is named.
        options = {}
        if skip is not None:
            options['skip'] = os.path.join(data_dir, skip)
        return contig_break_finder.ContigBreakFinder(
            fasta_file=os.path.join(data_dir, input_fasta),
            gene_file=os.path.join(data_dir, gene_fasta),
            choose_random_gene=False,
            rename=rename,
            **options)

    # Each case pairs a configured finder with the name of the file in
    # data_dir that holds the sequences expected after run().
    tests = [
        # dnaa at start - return identical sequence
        (make_finder("BREAKFINDER_input_dnaa_at_start.fa"),
         'BREAKFINDER_output_dnaa_at_start.fa'),
        # dnaa in the middle
        (make_finder("BREAKFINDER_input_dnaa_in_middle.fa"),
         'BREAKFINDER_output_dnaa_in_middle.fa'),
        # dnaa in middle of contig but revcom
        (make_finder("BREAKFINDER_input_dnaa_in_middle_revcom.fa"),
         'BREAKFINDER_output_dnaa_in_middle.fa'),
        # dnaa at the end
        (make_finder("BREAKFINDER_input_dnaa_at_end.fa"),
         'BREAKFINDER_output_dnaa_at_end.fa'),
        # dnaa split across start and end, but chunks large enough for
        # promer to detect
        (make_finder("BREAKFINDER_input_dnaa_split.fa"),
         'BREAKFINDER_output_dnaa_split.fa'),
        # dnaa split across start and end but revcom
        (make_finder("BREAKFINDER_input_dnaa_split_revcom.fa"),
         'BREAKFINDER_output_dnaa_split_revcom.fa'),
        # no dnaa - do not change the contig
        (make_finder("BREAKFINDER_input_no_dnaa.fa"),
         'BREAKFINDER_input_no_dnaa.fa'),
        # best dnaa hit not first
        (make_finder("BREAKFINDER_input_multiple_dnaa.fa",
                     gene_fasta="BREAKFINDER_test_multiple_dnaA.fa"),
         'BREAKFINDER_output_multiple_dnaa.fa'),
        # dnaa split across edges - just 4 bases of dnaa at the end, contig
        # long enough to run promer on just the edges stuck together
        (make_finder("BREAKFINDER_input_dnaa_split_edge.fa"),
         'BREAKFINDER_output_dnaa_split_edge.fa'),
        # dnaa split across edges and rev com - just 4 bases of dnaa at the
        # end, contig long enough to run promer on just the edges stuck
        # together
        (make_finder("BREAKFINDER_input_dnaa_split_edge_revcom.fa"),
         'BREAKFINDER_output_dnaa_split_edge.fa'),
        # dnaa split across edges (just 5 bases of dnaa at the start) but
        # contig not long enough to run promer on just ends - will not find
        # dnaA
        (make_finder("BREAKFINDER_input_dnaa_split_edge_tooshort.fa"),
         'BREAKFINDER_input_dnaa_split_edge_tooshort.fa'),

        # ---- testing other options -----------
        # rename genes
        (make_finder("BREAKFINDER_input_dnaa_at_start.fa", rename=True),
         'BREAKFINDER_output_dnaa_at_start.fa'),
        # NOTE: the "no dnaa, but use prodigal" case is disabled. It used
        # BREAKFINDER_input_no_dnaa_use_prodigal.fa with gene file
        # BREAKFINDER_real_dnaa.fa, choose_random_gene=True, rename=False,
        # and expected BREAKFINDER_output_no_dnaa_use_prodigal.fa.
        # skip one contig
        (make_finder("BREAKFINDER_input_multiple_contigs.fa",
                     skip="BREAKFINDER_skip_one_id.txt"),
         'BREAKFINDER_output_skip_contig.fa'),
        # skip all contigs - do not change anything
        (make_finder("BREAKFINDER_input_multiple_contigs.fa",
                     rename=True,
                     skip="BREAKFINDER_skip_all.txt"),
         'BREAKFINDER_input_multiple_contigs.fa'),
    ]

    for finder, expected_name in tests:
        finder.run()
        self.assertTrue(os.path.isfile(finder.output_file))
        self.assertTrue(os.path.isfile(finder.summary_file))

        expected_contigs = {}
        tasks.file_to_dict(os.path.join(data_dir, expected_name), expected_contigs)
        for contig_id, expected_contig in expected_contigs.items():
            # assertEqual shows both sequences on failure, unlike
            # assertTrue(a == b).
            self.assertEqual(expected_contig.seq, finder.contigs[contig_id].seq)

        os.remove(finder.output_file)
        os.remove(finder.summary_file)