def test_dots_in_filename(self):
    """Check the generated config files when sample filenames contain dots and '#'.

    Builds a KmcComplex with two trait samples and one nontrait sample whose
    base names contain '.' and '#', asks it to write its config files, and
    compares the traits and nontraits config files with the expected text.
    In the expected text the set names use underscores (e.g. ``a_b_c``) while
    the database names keep the original characters (e.g. ``kmc_a.b#c``).
    """
    # NOTE(review): the line breaks inside the expected strings were
    # reconstructed as one entry per line -- confirm against the files that
    # KmcComplex.create_config_files() actually writes.
    k = KmcComplex(
        os.getcwd(), 1, 1,
        [SampleData('a.b#c_1.fastq', 'a.b#c_2.fastq'),
         SampleData('a.b.c.d_1.fastq', 'a.b.c.d_2.fastq')],
        [SampleData('a.b.c.d.e_1.fastq', 'a.b.c.d.e_2.fastq')],
        'intersection', False)
    k.create_config_files()

    with open(os.path.join(k.temp_working_dir, 'traits_config_file'), 'r') as actual_file:
        actual_config_content = actual_file.read()
        self.assertEqual(actual_config_content, """\
INPUT:
a_b_c = kmc_a.b#c
a_b_c_d = kmc_a.b.c.d
a_b_c_d_e = kmc_a.b.c.d.e
OUTPUT:
traits = a_b_c*a_b_c_d
OUTPUT_PARAMS:
-ci1
""")

    with open(os.path.join(k.temp_working_dir, 'nontraits_config_file'), 'r') as actual_file:
        actual_config_content = actual_file.read()
        self.assertEqual(actual_config_content, """\
INPUT:
a_b_c = kmc_a.b#c
a_b_c_d = kmc_a.b.c.d
a_b_c_d_e = kmc_a.b.c.d.e
OUTPUT:
nontraits = a_b_c_d_e
OUTPUT_PARAMS:
-ci1
""")

    # The other KmcComplex tests finish with cleanup(); this one did not, so
    # whatever the instance created under temp_working_dir was left behind.
    k.cleanup()
def test_sample_definition_line(self):
    """A sample whose name contains '#' yields the expected definition line."""
    kmc_complex = KmcComplex(os.getcwd(), 1, 1, [], [], 'union')
    sample = SampleData('/path/to/sample#ABC_1.fastq',
                        '/path/to/sample#ABC_2.fastq')

    definition_line = kmc_complex.sample_definition_line(sample)

    # Set name on the left uses '_' in place of '#'; the database name on
    # the right keeps the '#'.
    self.assertEqual(definition_line, 'sample_ABC = kmc_sample#ABC')
    kmc_complex.cleanup()
def test_sample_definitions_str(self):
    """All trait and nontrait samples are listed, one definition per line."""
    trait_samples = [
        SampleData('a_1.fastq', 'a_2.fastq'),
        SampleData('b_1.fastq', 'b_2.fastq'),
    ]
    nontrait_samples = [SampleData('c_1.fastq', 'c_2.fastq')]
    kmc_complex = KmcComplex(os.getcwd(), 1, 1, trait_samples,
                             nontrait_samples, 'union', False)

    # NOTE(review): line breaks reconstructed as one definition per line,
    # each terminated by a newline -- confirm against sample_definitions_str().
    wanted = (
        'a = kmc_a\n'
        'b = kmc_b\n'
        'c = kmc_c\n'
    )

    self.assertEqual(kmc_complex.sample_definitions_str(), wanted)
    kmc_complex.cleanup()
def run(self):
    """Drive the pipeline from the input spreadsheets to the methods summary.

    Steps, in order: create the output directory, parse the trait and
    nontrait sample lists, build per-sample kmer databases, run the
    KmcComplex set operation, filter the trait reads against its result
    database, assemble the trait samples, optionally plot kmers, write
    'methods_summary.txt', and finally call self.cleanup().
    """
    self.logger.warning('Using KMC syntax version %s', self.kmc_major_version)
    os.makedirs(self.output_directory)

    traits = SpreadsheetParser(self.file_of_traits, self.verbose).extract_samples()
    nontraits = SpreadsheetParser(self.file_of_nontraits, self.verbose).extract_samples()

    self.logger.warning('Generating kmer databases for all samples')
    sample_databases = self.generate_kmer_databases(traits, nontraits)

    self.logger.warning("Generating a database of kmers which are in the traits but not in the nontraits set")
    complex_database = KmcComplex(
        self.output_directory,
        self.threads,
        self.min_kmers_threshold,
        traits,
        nontraits,
        self.action,
        self.verbose)
    complex_database.run()

    read_filters = self.filter_data_against_kmers(traits, complex_database.result_database())

    self.logger.warning('Assembling all of the trait samples')
    assemblies = self.assemble_samples(traits, self.keep_files)

    if self.kmer_plot:
        # Only assemblies whose filtered output file exists on disk are plotted.
        existing_assembly_files = []
        for assembly in assemblies:
            if os.path.exists(assembly.filtered_spades_assembly_file()):
                existing_assembly_files.append(assembly.filtered_spades_assembly_file())

        PlotKmers(
            existing_assembly_files,
            self.output_directory,
            self.threads,
            self.kmer,
            self.max_kmers_threshold,
            self.verbose,
            self.plot_filename).generate_plot()

    summary_path = os.path.join(self.output_directory, 'methods_summary.txt')
    Methods(
        summary_path,
        traits,
        nontraits,
        self.min_kmers_threshold,
        self.min_contig_len,
        self.start_time,
        self.spades_exec,
        self.verbose).create_file()

    self.cleanup(sample_databases, complex_database, read_filters)
def test_samples_to_set_operation_str(self):
    """With the 'union' action the trait set names are joined with '+'."""
    trait_samples = [
        SampleData('a_1.fastq', 'a_2.fastq'),
        SampleData('b_1.fastq', 'b_2.fastq'),
    ]
    nontrait_samples = [SampleData('c_1.fastq', 'c_2.fastq')]
    kmc_complex = KmcComplex(os.getcwd(), 1, 1, trait_samples,
                             nontrait_samples, 'union')

    traits_operation = kmc_complex.trait_samples_to_set_operation_str()
    self.assertEqual(traits_operation, 'traits = a+b')

    nontraits_operation = kmc_complex.nontrait_samples_to_set_operation_str()
    self.assertEqual(nontraits_operation, 'nontraits = c')

    kmc_complex.cleanup()
def test_create_config_file_intersection(self):
    """Check all three config files written for the 'intersection' action.

    The traits file combines the trait sets with '*', the nontraits file
    names the single nontrait set, and the combined file subtracts the
    nontraits from the traits.
    """
    trait_samples = [
        SampleData('a_1.fastq', 'a_2.fastq'),
        SampleData('b_1.fastq', 'b_2.fastq'),
    ]
    nontrait_samples = [SampleData('c_1.fastq', 'c_2.fastq')]
    kmc_complex = KmcComplex(os.getcwd(), 1, 1, trait_samples,
                             nontrait_samples, 'intersection')
    kmc_complex.create_config_files()

    def written_config(basename):
        # Read one generated config file back from the temp working directory.
        config_path = os.path.join(kmc_complex.temp_working_dir, basename)
        with open(config_path, 'r') as handle:
            return handle.read()

    # NOTE(review): line breaks in the expected texts were reconstructed as
    # one entry per line -- confirm against the files actually written.
    self.assertEqual(written_config('traits_config_file'), """\
INPUT:
a = kmc_a
b = kmc_b
c = kmc_c
OUTPUT:
traits = a*b
OUTPUT_PARAMS:
-ci1
""")

    self.assertEqual(written_config('nontraits_config_file'), """\
INPUT:
a = kmc_a
b = kmc_b
c = kmc_c
OUTPUT:
nontraits = c
OUTPUT_PARAMS:
-ci1
""")

    self.assertEqual(written_config('combined_config_file'), """\
INPUT:
set1 = traits
set2 = nontraits
OUTPUT:
result = set1-set2
OUTPUT_PARAMS:
-ci1
""")

    kmc_complex.cleanup()
def run(self):
    """Run the full pipeline inline, from the input spreadsheets to cleanup.

    Steps, in order: create the output directory, parse the trait and
    nontrait sample lists, build a kmer database per sample, run the
    KmcComplex set operation, filter every trait sample against its result
    database, then for each trait sample assemble, extract the assembly's
    kmers, re-filter the reads and reassemble. Finally write
    'methods_summary.txt' and call self.cleanup().
    """
    os.makedirs(self.output_directory)
    traits = SpreadsheetParser(self.file_of_trait_fastqs).extract_samples()
    nontraits = SpreadsheetParser(self.file_of_nontrait_fastqs).extract_samples()

    self.logger.info("Generating a kmer database for each sample")
    sample_databases = []
    for sample_group in (traits, nontraits):
        for sample in sample_group:
            sample_database = Kmc(
                self.output_directory,
                sample,
                self.threads,
                self.kmer,
                self.min_kmers_threshold,
                self.max_kmers_threshold)
            sample_database.run()
            sample_databases.append(sample_database)

    self.logger.info("Generating a database of kmers which are in the traits but not in the nontraits set")
    complex_database = KmcComplex(
        self.output_directory,
        self.threads,
        self.min_kmers_threshold,
        traits,
        nontraits,
        self.action)
    complex_database.run()

    # First filtering pass: every trait sample against the complex result.
    read_filters = []
    for sample in traits:
        trait_filter = KmcFilter(
            sample,
            self.output_directory,
            self.threads,
            complex_database.result_database())
        trait_filter.filter_fastq_file_against_kmers()
        read_filters.append(trait_filter)

    assembly_databases = []
    final_assemblies = []
    for sample in traits:
        self.logger.info("First assembly with reads only matching kmers")
        first_assembly = SpadesAssembly(
            sample,
            self.output_directory,
            self.threads,
            self.kmer,
            self.spades_exec,
            self.min_contig_len,
            True,
            self.min_spades_contig_coverage,
            False)
        first_assembly.run()

        # The size in bytes of the filtered assembly file is compared with
        # min_contig_len; a file at or below that size skips this sample.
        first_assembly_size = os.path.getsize(first_assembly.filtered_spades_assembly_file())
        if first_assembly_size <= self.min_contig_len:
            self.logger.info("Theres not enough data in the first assembly after filtering, so skipping the rest of the steps for this sample.")
            continue

        self.logger.info("Rescaffold 1st assembly with all reads")
        # The original reads are used next to join up the small contigs,
        # starting from the kmers present in the filtered assembly.
        self.logger.info("Extract kmers from assembly")
        assembly_database = KmcFasta(
            self.output_directory,
            first_assembly.filtered_spades_assembly_file(),
            self.threads,
            self.kmer,
            1,
            self.max_kmers_threshold)
        assembly_database.run()
        assembly_databases.append(assembly_database)

        # Second filtering pass: reads matching the assembly's own kmers.
        self.logger.info("Pull out reads from original fastq files matching assembly kmers")
        assembly_filter = KmcFilter(
            sample,
            self.output_directory,
            self.threads,
            assembly_database.output_database_name())
        assembly_filter.filter_fastq_file_against_kmers()
        read_filters.append(assembly_filter)

        # The first assembly is cleaned up here unless running verbosely.
        if not self.verbose:
            first_assembly.cleanup()

        self.logger.info("Reassemble with SPAdes")
        final_assembly = SpadesAssembly(
            sample,
            self.output_directory,
            self.threads,
            self.kmer,
            self.spades_exec,
            self.min_contig_len,
            False,
            self.min_spades_contig_coverage,
            True)
        final_assembly.run()
        final_assemblies.append(final_assembly)
        print(final_assembly.filtered_spades_assembly_file() + "\n")

    summary_path = os.path.join(self.output_directory, 'methods_summary.txt')
    Methods(
        summary_path,
        traits,
        nontraits,
        self.min_kmers_threshold,
        self.min_contig_len,
        self.start_time,
        self.spades_exec).create_file()

    self.cleanup(sample_databases, assembly_databases, complex_database,
                 read_filters, final_assemblies)
def test_kmc_complex_command(self):
    """With verbose off, the kmc_tools command line discards its output."""
    kmc_complex = KmcComplex(os.getcwd(), 1, 1, [], [], 'union', False)

    command = kmc_complex.kmc_complex_command('complex_config_file')

    self.assertEqual(
        command,
        'kmc_tools -t1 complex complex_config_file > /dev/null 2>&1')
    kmc_complex.cleanup()