예제 #1
0
	def test_dots_in_filename(self):
		k = KmcComplex(os.getcwd(), 1, 1, [SampleData('a.b#c_1.fastq', 'a.b#c_2.fastq'), SampleData('a.b.c.d_1.fastq', 'a.b.c.d_2.fastq')], [SampleData('a.b.c.d.e_1.fastq', 'a.b.c.d.e_2.fastq')], 'intersection', False)
		k.create_config_files()
		
		with open(os.path.join(k.temp_working_dir, 'traits_config_file'), 'r') as actual_file:
			actual_config_content = actual_file.read()
			self.assertEqual(actual_config_content, """\
INPUT:
a_b_c = kmc_a.b#c
a_b_c_d = kmc_a.b.c.d
a_b_c_d_e = kmc_a.b.c.d.e
OUTPUT:
traits = a_b_c*a_b_c_d
OUTPUT_PARAMS:
-ci1
""") 
		with open(os.path.join(k.temp_working_dir, 'nontraits_config_file'), 'r') as actual_file:
			actual_config_content = actual_file.read()
			self.assertEqual(actual_config_content, """\
INPUT:
a_b_c = kmc_a.b#c
a_b_c_d = kmc_a.b.c.d
a_b_c_d_e = kmc_a.b.c.d.e
OUTPUT:
nontraits = a_b_c_d_e
OUTPUT_PARAMS:
-ci1
""")
예제 #2
0
    def test_sample_definition_line(self):
        """A '#' in the sample base name is replaced by '_' in the identifier.

        The path prefix is dropped and the database name on the right-hand
        side keeps the original base name.
        """
        kmc_complex = KmcComplex(os.getcwd(), 1, 1, [], [], 'union')
        sample = SampleData(
            '/path/to/sample#ABC_1.fastq',
            '/path/to/sample#ABC_2.fastq',
        )

        definition = kmc_complex.sample_definition_line(sample)
        self.assertEqual(definition, 'sample_ABC = kmc_sample#ABC')

        kmc_complex.cleanup()
예제 #3
0
	def test_sample_definitions_str(self):
		k = KmcComplex(os.getcwd(), 1, 1, [SampleData('a_1.fastq', 'a_2.fastq'), SampleData('b_1.fastq', 'b_2.fastq')], [SampleData('c_1.fastq', 'c_2.fastq')], 'union', False)
		expected_output = """\
a = kmc_a
b = kmc_b
c = kmc_c
"""
		self.assertEqual(k.sample_definitions_str(),expected_output)
		k.cleanup()
예제 #4
0
	def run(self):
		"""Drive the full pipeline from sample spreadsheets to a methods summary.

		Steps, in order: create the output directory, parse the trait and
		nontrait sample lists, build per-sample kmer databases, combine them
		with KmcComplex, filter the trait reads against the combined result,
		assemble the trait samples, optionally plot kmers, write
		'methods_summary.txt', and finally clean up intermediate objects.

		Progress messages are logged at warning level. os.makedirs is called
		without exist_ok, so an already existing output directory raises.
		"""
		self.logger.warning('Using KMC syntax version %s', self.kmc_major_version)
		os.makedirs(self.output_directory)

		traits_parser = SpreadsheetParser(self.file_of_traits, self.verbose)
		trait_samples = traits_parser.extract_samples()
		nontraits_parser = SpreadsheetParser(self.file_of_nontraits, self.verbose)
		nontrait_samples = nontraits_parser.extract_samples()

		self.logger.warning('Generating kmer databases for all samples')
		sample_databases = self.generate_kmer_databases(trait_samples, nontrait_samples)

		self.logger.warning("Generating a database of kmers which are in the traits but not in the nontraits set")
		complex_db = KmcComplex(
			self.output_directory,
			self.threads,
			self.min_kmers_threshold,
			trait_samples,
			nontrait_samples,
			self.action,
			self.verbose)
		complex_db.run()

		read_filters = self.filter_data_against_kmers(trait_samples, complex_db.result_database())

		self.logger.warning('Assembling all of the trait samples')
		assemblies = self.assemble_samples(trait_samples, self.keep_files)

		if self.kmer_plot:
			# Only assemblies whose filtered output exists on disk are plotted.
			assembly_paths = [
				assembly.filtered_spades_assembly_file()
				for assembly in assemblies
				if os.path.exists(assembly.filtered_spades_assembly_file())
			]
			PlotKmers(
				assembly_paths,
				self.output_directory,
				self.threads,
				self.kmer,
				self.max_kmers_threshold,
				self.verbose,
				self.plot_filename).generate_plot()

		summary_path = os.path.join(self.output_directory, 'methods_summary.txt')
		Methods(
			summary_path,
			trait_samples,
			nontrait_samples,
			self.min_kmers_threshold,
			self.min_contig_len,
			self.start_time,
			self.spades_exec,
			self.verbose).create_file()

		self.cleanup(sample_databases, complex_db, read_filters)
예제 #5
0
 def test_samples_to_set_operation_str(self):
     """With the 'union' action, samples in a set are joined with '+'."""
     trait_inputs = [
         SampleData('a_1.fastq', 'a_2.fastq'),
         SampleData('b_1.fastq', 'b_2.fastq'),
     ]
     nontrait_inputs = [SampleData('c_1.fastq', 'c_2.fastq')]
     kmc_complex = KmcComplex(os.getcwd(), 1, 1, trait_inputs,
                              nontrait_inputs, 'union')

     traits_operation = kmc_complex.trait_samples_to_set_operation_str()
     self.assertEqual(traits_operation, 'traits = a+b')

     nontraits_operation = kmc_complex.nontrait_samples_to_set_operation_str()
     self.assertEqual(nontraits_operation, 'nontraits = c')

     kmc_complex.cleanup()
예제 #6
0
    def test_create_config_file_intersection(self):
        k = KmcComplex(os.getcwd(), 1, 1, [
            SampleData('a_1.fastq', 'a_2.fastq'),
            SampleData('b_1.fastq', 'b_2.fastq')
        ], [SampleData('c_1.fastq', 'c_2.fastq')], 'intersection')
        k.create_config_files()

        with open(os.path.join(k.temp_working_dir, 'traits_config_file'),
                  'r') as actual_file:
            actual_config_content = actual_file.read()
            self.assertEqual(
                actual_config_content, """\
INPUT:
a = kmc_a
b = kmc_b
c = kmc_c
OUTPUT:
traits = a*b
OUTPUT_PARAMS:
-ci1
""")
        with open(os.path.join(k.temp_working_dir, 'nontraits_config_file'),
                  'r') as actual_file:
            actual_config_content = actual_file.read()
            self.assertEqual(
                actual_config_content, """\
INPUT:
a = kmc_a
b = kmc_b
c = kmc_c
OUTPUT:
nontraits = c
OUTPUT_PARAMS:
-ci1
""")
        with open(os.path.join(k.temp_working_dir, 'combined_config_file'),
                  'r') as actual_file:
            actual_config_content = actual_file.read()
            self.assertEqual(
                actual_config_content, """\
INPUT:
set1 = traits
set2 = nontraits
OUTPUT:
result = set1-set2
OUTPUT_PARAMS:
-ci1
""")

        k.cleanup()
예제 #7
0
    def run(self):
        """Run the pipeline end to end for the configured trait/nontrait samples.

        Stages, in order:
          1. Create the output directory (os.makedirs without exist_ok, so
             an existing directory raises).
          2. Parse the trait and nontrait sample lists.
          3. Build a Kmc database per sample (traits first, then nontraits).
          4. Combine the databases with KmcComplex using self.action.
          5. Filter every trait sample against the combined result database.
          6. Per trait sample: assemble, and if the filtered assembly is
             large enough, extract its kmers, re-filter the reads against
             them and assemble a second time.
          7. Write 'methods_summary.txt' and clean up intermediates.
        """
        os.makedirs(self.output_directory)
        trait_samples = SpreadsheetParser(
            self.file_of_trait_fastqs).extract_samples()
        nontrait_samples = SpreadsheetParser(
            self.file_of_nontrait_fastqs).extract_samples()

        self.logger.info("Generating a kmer database for each sample")
        # Every Kmc object is kept so it can be handed to cleanup() at the end.
        kmc_samples = []
        for set_of_samples in [trait_samples, nontrait_samples]:
            for sample in set_of_samples:
                kmc_sample = Kmc(self.output_directory, sample, self.threads,
                                 self.kmer, self.min_kmers_threshold,
                                 self.max_kmers_threshold)
                kmc_sample.run()
                kmc_samples.append(kmc_sample)

        self.logger.info(
            "Generating a database of kmers which are in the traits but not in the nontraits set"
        )
        kmc_complex = KmcComplex(self.output_directory, self.threads,
                                 self.min_kmers_threshold, trait_samples,
                                 nontrait_samples, self.action)
        kmc_complex.run()

        # First filtering pass: trait reads against the combined kmer database.
        kmc_filters = []
        for sample in trait_samples:
            kmc_filter = KmcFilter(sample, self.output_directory, self.threads,
                                   kmc_complex.result_database())
            kmc_filter.filter_fastq_file_against_kmers()
            kmc_filters.append(kmc_filter)

        kmc_fastas = []
        spades_assemblies = []
        for sample in trait_samples:
            self.logger.info("First assembly with reads only matching kmers")
            # NOTE(review): the positional True/False flags differ between
            # the first and the final assembly; their meaning is defined by
            # SpadesAssembly, which is not visible here - confirm there.
            spades_assembly = SpadesAssembly(sample, self.output_directory,
                                             self.threads, self.kmer,
                                             self.spades_exec,
                                             self.min_contig_len, True,
                                             self.min_spades_contig_coverage,
                                             False)
            spades_assembly.run()

            # The on-disk size in bytes of the filtered assembly file is
            # compared against min_contig_len to decide whether to go on.
            # NOTE(review): on this skip path the first-pass assembly is
            # neither cleaned up here nor added to spades_assemblies.
            if os.path.getsize(spades_assembly.filtered_spades_assembly_file()
                               ) <= self.min_contig_len:
                self.logger.info(
                    "Theres not enough data in the first assembly after filtering, so skipping the rest of the steps for this sample."
                )
                continue

            self.logger.info("Rescaffold 1st assembly with all reads")
            # Next we want to scaffold by using all of the original reads to join up the small contigs.
            # Extract all of the kmers found in the filtered assembly
            self.logger.info("Extract kmers from assembly")
            kmc_fasta = KmcFasta(
                self.output_directory,
                spades_assembly.filtered_spades_assembly_file(), self.threads,
                self.kmer, 1, self.max_kmers_threshold)
            kmc_fasta.run()
            kmc_fastas.append(kmc_fasta)

            # Pull out any reads matching the kmers found in the assembly
            self.logger.info(
                "Pull out reads from original fastq files matching assembly kmers"
            )
            kmc_filter = KmcFilter(sample, self.output_directory, self.threads,
                                   kmc_fasta.output_database_name())
            kmc_filter.filter_fastq_file_against_kmers()
            kmc_filters.append(kmc_filter)

            # delete the original assembly directory
            # (skipped in verbose mode, so the first-pass output is kept)
            if not self.verbose:
                spades_assembly.cleanup()

            self.logger.info("Reassemble with SPAdes")
            final_spades_assembly = SpadesAssembly(
                sample, self.output_directory, self.threads, self.kmer,
                self.spades_exec, self.min_contig_len, False,
                self.min_spades_contig_coverage, True)
            final_spades_assembly.run()
            spades_assemblies.append(final_spades_assembly)
            # The final assembly path is written to stdout, one per sample.
            print(final_spades_assembly.filtered_spades_assembly_file() + "\n")

        method_file = Methods(
            os.path.join(self.output_directory, 'methods_summary.txt'),
            trait_samples, nontrait_samples, self.min_kmers_threshold,
            self.min_contig_len, self.start_time, self.spades_exec)
        method_file.create_file()
        # Hand every intermediate object collected above to cleanup().
        self.cleanup(kmc_samples, kmc_fastas, kmc_complex, kmc_filters,
                     spades_assemblies)
예제 #8
0
	def test_kmc_complex_command(self):
		"""The command uses the thread count and config path and silences output."""
		kmc_complex = KmcComplex(os.getcwd(), 1, 1, [], [], 'union', False)
		command = kmc_complex.kmc_complex_command('complex_config_file')
		self.assertEqual(
			command,
			'kmc_tools -t1 complex complex_config_file > /dev/null 2>&1')
		kmc_complex.cleanup()