def make_spreadsheet_for_ena_download(self):
    """Return a Spreadsheet fixture whose reads use the ENA-download layout.

    Each read carries an identifier in ``forward_read`` and a 'T'/'F' flag in
    ``reverse_read``; the study accession is left as None.
    """
    first_read = RawRead(forward_read='PAIR1', reverse_read='T', sample_name='SAMPLE1',
                         taxon_id='1280', library_name='LIB1', sample_accession=None)
    second_read = RawRead(forward_read='Pair2.fastq.gz', reverse_read='F', sample_name='SAMPLE2',
                          taxon_id='1280', library_name='LIB2', sample_accession=None)

    sheet = Spreadsheet()
    # Study-level metadata, assigned one field at a time.
    sheet.supplier = 'Supplier'
    sheet.organisation = 'Org'
    sheet.contact = 'Contact'
    sheet.technology = 'Illumina'
    sheet.name = 'AStudyName1'
    sheet.accession = None
    sheet.size = 1.90
    sheet.limit = '01/01/2025'
    sheet.reads = [first_read, second_read]
    return sheet
def test_copy_files_single_strand(self, copyfile_patch):
    """copy_files copies both files of a paired read and the one file of a single read.

    Args:
        copyfile_patch: mock whose recorded calls are inspected; presumably
            injected by a patch decorator that is not visible here.
    """
    spreadsheet = Spreadsheet.new_instance("MyStudy", [
        RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz', sample_name='SAMPLE1',
                taxon_id='1280', library_name='LIB1', sample_accession=None),
        RawRead(forward_read='SINGLE.fastq.gz', reverse_read=None, sample_name='SAMPLE2',
                taxon_id='1280', library_name='LIB2', sample_accession=None),
    ])
    under_test = Preparation.new_instance(spreadsheet, 'destination', 0, 0)

    under_test.copy_files('source')

    # assertEqual rather than the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(copyfile_patch.call_args_list,
                     [call('source/PAIR1_1.fastq.gz', 'destination/0/PAIR1_1.fastq.gz'),
                      call('source/PAIR1_2.fastq.gz', 'destination/0/PAIR1_2.fastq.gz'),
                      call('source/SINGLE.fastq.gz', 'destination/0/SINGLE.fastq.gz')])
def test_T_or_F_is_valid(self):
    """Reads flagged 'T' and 'F' in reverse_read produce no errors from the T/F check."""
    reads = [
        RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read=flag, sample_name='SAMPLE1',
                sample_accession=None, taxon_id="1280", library_name='LIB1')
        for flag in ('T', 'F')
    ]
    spreadsheet = Spreadsheet.new_instance("1234567890123456", reads)

    errors = check_double_ended_column_is_T_or_F(spreadsheet)

    self.assertEqual([], errors)
def test_uniqueness_of_files_sample_and_library_ENA_download(self):
    """Two reads with distinct identifiers, samples and libraries yield no uniqueness errors."""
    first_read = RawRead(forward_read='PAIR1', reverse_read='T', sample_name='SAMPLE1',
                         sample_accession=None, taxon_id="1280", library_name='LIB1')
    second_read = RawRead(forward_read='PAIR2', reverse_read='F', sample_name='SAMPLE2',
                          sample_accession=None, taxon_id="1280", library_name='LIB2')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [first_read, second_read])

    errors = validate_uniqueness_of_reads(spreadsheet)

    self.assertEqual([], errors)
def test_forward_read_not_unique(self):
    """Two reads sharing the same forward read file are reported as not unique."""
    first_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                         sample_name='SAMPLE1', sample_accession=None, taxon_id="1280",
                         library_name='LIB1')
    second_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR2_2.fastq.gz',
                          sample_name='SAMPLE2', sample_accession=None, taxon_id="1280",
                          library_name='LIB2')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [first_read, second_read])
    expected = ["Forward read is not unique: PAIR1_1.fastq.gz"]

    errors = validate_uniqueness_of_reads(spreadsheet)

    self.assertEqual(expected, errors)
def test_pair_naming_convention_is_valid_for_single_read(self):
    """A read without a reverse read yields no pair-naming errors."""
    single_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read=None, sample_name='SAMPLE1',
                          sample_accession=None, taxon_id="1280", library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [single_read])

    errors = validate_pair_naming_convention(spreadsheet)

    self.assertEqual([], errors)
def test_mandatory_fields_for_reads_are_populated_single_read(self):
    """A single read with forward read, sample name, taxon id and library name set passes."""
    single_read = RawRead(forward_read='READ.fastq.gz', reverse_read=None, sample_name='SAMPLE1',
                          sample_accession=None, taxon_id="1280", library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [single_read])

    errors = validate_mandatory_read_fields(spreadsheet)

    self.assertEqual([], errors)
def test_no_hyphen_in_filename(self):
    """File names containing no hyphen yield no errors from the hyphen check."""
    paired_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                          sample_name='SAMPLE1', sample_accession=None, taxon_id="1280",
                          library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [paired_read])

    errors = validate_no_hyphen_in_filename(spreadsheet)

    self.assertEqual([], errors)
def test_forward_read_not_populated(self):
    """Validating mandatory fields of a read whose forward read is None raises an exception."""

    def validate_read_without_forward_file():
        # Fixture construction stays inside the guarded callable so that an
        # exception raised at any step is what assertRaises observes.
        read = RawRead(forward_read=None, reverse_read=None, sample_name='SAMPLE1',
                       sample_accession=None, taxon_id="1280", library_name='LIB1')
        validate_mandatory_read_fields(Spreadsheet.new_instance("1234567890123456", [read]))

    self.assertRaises(Exception, validate_read_without_forward_file)
def test_sample_name_with_valid_char_should_pass_validation(self):
    """The sample name 'SAMPLE_1' yields no sample-name validation errors."""
    paired_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                          sample_name="SAMPLE_1", sample_accession=None, taxon_id="1280",
                          library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("ValidName12345__", [paired_read])

    errors = validate_sample_names(spreadsheet)

    self.assertEqual([], errors)
def test_valid_taxon_id(self):
    """The taxon id "1280" yields no taxon-id validation errors."""
    paired_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                          sample_name='SAMPLE1', sample_accession=None, taxon_id="1280",
                          library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("ValidName12345__", [paired_read])

    errors = validate_taxon_ids(spreadsheet)

    self.assertEqual([], errors)
def test_single_read_is_compressed(self):
    """A single '.fastq.gz' forward read with no reverse read passes the compression check."""
    single_read = RawRead(forward_read='PAIR1.fastq.gz', reverse_read=None, sample_name='SAMPLE1',
                          sample_accession=None, taxon_id="1280", library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [single_read])

    errors = validate_files_are_compressed(spreadsheet)

    self.assertEqual([], errors)
def test_none_is_not_valid(self):
    """A reverse_read of None is rejected by the T/F check with a formatting error."""
    read_without_flag = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read=None,
                                sample_name='SAMPLE1', sample_accession=None, taxon_id="1280",
                                library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [read_without_flag])
    expected = ["Double-ended is incorrectly formatted, must be T or F"]

    errors = check_double_ended_column_is_T_or_F(spreadsheet)

    self.assertEqual(expected, errors)
def _raw_read(self, forward_read, reverse_read, sample_name, library_name, accession, taxon_id='1280'):
    """Build a RawRead from the given values.

    ``accession`` is passed on as the read's ``sample_accession``; ``taxon_id``
    defaults to '1280'.
    """
    fields = {
        'forward_read': forward_read,
        'reverse_read': reverse_read,
        'sample_name': sample_name,
        'sample_accession': accession,
        'taxon_id': taxon_id,
        'library_name': library_name,
    }
    return RawRead(**fields)
def test_sample_name_with_invalid_char_should_fail_validation(self):
    """A 34-character sample name made of punctuation and whitespace yields 34 errors."""
    invalid_name = "!\"£$%^&*()+={}[]:@~;'#?/>.<,|\\`¬\t "
    read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                   sample_name=invalid_name, sample_accession=None, taxon_id="1280",
                   library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("ValidName12345__", [read])

    errors = validate_sample_names(spreadsheet)

    self.assertEqual(34, len(errors))
def test_library_name_not_populated(self):
    """A read whose library name is None is reported, with the read's repr in the message."""
    read = RawRead(forward_read='READ.fastq.gz', reverse_read=None, sample_name='SAMPLE1',
                   sample_accession=None, taxon_id="1280", library_name=None)
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [read])
    expected = [
        "Missing library name for RawRead(forward_read='READ.fastq.gz', reverse_read=None, "
        "sample_name='SAMPLE1', sample_accession=None, taxon_id='1280', library_name=None)"
    ]

    errors = validate_mandatory_read_fields(spreadsheet)

    self.assertEqual(expected, errors)
def test_path_in_filename_is_invalid(self):
    """Forward and reverse file names that include a directory path are each reported."""
    read = RawRead(forward_read='/some/path/PAIR1_1.fastq.gz',
                   reverse_read='/some/path/PAIR1_2.fastq.gz',
                   sample_name='SAMPLE1', sample_accession=None, taxon_id="1280",
                   library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [read])
    expected = [
        "Path present in filename: /some/path/PAIR1_1.fastq.gz",
        "Path present in filename: /some/path/PAIR1_2.fastq.gz",
    ]

    errors = validate_no_path_in_filename(spreadsheet)

    self.assertEqual(expected, errors)
def test_invalid_pair_naming_convention(self):
    """Forward/reverse names with different prefixes are reported as inconsistent."""
    read = RawRead(forward_read='PAIR1xxx_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                   sample_name='SAMPLE1', sample_accession=None, taxon_id="1280",
                   library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [read])
    expected = [
        "Inconsistent naming convention of forward and reverse reads for RawRead("
        "forward_read='PAIR1xxx_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz', "
        "sample_name='SAMPLE1', sample_accession=None, taxon_id='1280', library_name='LIB1')"
    ]

    errors = validate_pair_naming_convention(spreadsheet)

    self.assertEqual(expected, errors)
def test_reads_are_not_fastq(self):
    """'.gz' files without a fastq extension are reported for both forward and reverse reads."""
    read = RawRead(forward_read='PAIR1_1.gz', reverse_read='PAIR1_2.gz', sample_name='SAMPLE1',
                   sample_accession=None, taxon_id="1280", library_name='LIB1')
    spreadsheet = Spreadsheet.new_instance("1234567890123456", [read])
    expected = [
        "Forward read file is not correctly formatted for RawRead(forward_read='PAIR1_1.gz', "
        "reverse_read='PAIR1_2.gz', sample_name='SAMPLE1', sample_accession=None, "
        "taxon_id='1280', library_name='LIB1')",
        "Reverse read file is not correctly formatted for RawRead(forward_read='PAIR1_1.gz', "
        "reverse_read='PAIR1_2.gz', sample_name='SAMPLE1', sample_accession=None, "
        "taxon_id='1280', library_name='LIB1')",
    ]

    errors = validate_files_are_compressed(spreadsheet)

    self.assertEqual(expected, errors)
def setUp(self):
    """Create a temporary directory with three fixture files and three Preparation objects.

    ``under_test1``..``under_test3`` are built from identical two-read
    spreadsheets and differ only in the third argument given to
    ``Preparation.new_instance`` (0, 1 and 2).
    """
    self.tempdir = TempDirectory()
    for relative_path in ('1/Accession1.fastq.gz',
                          '2/Accession1_1.fastq.gz',
                          '2/Accession1_2.fastq.gz'):
        self.tempdir.write(relative_path, b'the text')
    self.tempdir_path = self.tempdir.path
    print('temp', self.tempdir_path)

    def new_preparation(index):
        # A fresh spreadsheet (and fresh reads) per Preparation, so the three
        # objects under test share no state.
        # NOTE(review): `index` presumably selects the numbered sub-directory
        # ('1/', '2/') written above -- confirm against Preparation.new_instance.
        reads = [
            RawRead(forward_read='Accession%d' % number, reverse_read='T',
                    sample_name='SAMPLE%d' % number, taxon_id='1280',
                    library_name='LIB%d' % number, sample_accession=None)
            for number in (1, 2)
        ]
        spreadsheet = Spreadsheet.new_instance("MyStudy", reads)
        return Preparation.new_instance(spreadsheet, self.tempdir_path, index, 0)

    self.under_test1 = new_preparation(0)
    self.under_test2 = new_preparation(1)
    self.under_test3 = new_preparation(2)
def load_xlsx(self):
    """Parse the worksheet held in ``self._sheet`` into a ``Spreadsheet``.

    ``self._sheet`` is accessed with 1-based ``cell(row=, column=).value``,
    ``max_row`` and ``max_column`` (presumably an openpyxl worksheet).

    The first ten rows are scanned: column 1 holds a metadata label and
    column 2 its value.  Scanning stops at the first row whose label is
    'Filename' or 'Run Accession'; that row is the header row and data rows
    start on the following row and run up to ``max_row``.

    For every data row one ``RawRead`` is appended per identifier column
    present in the header: 'Filename' (paired with 'Mate File') and
    'Run Accession' (paired with 'Double-ended Reads').  Header titles are
    located regardless of their left-to-right order, and a column that is
    absent from the header row is read as ``None`` for every row instead of
    failing on an unbound variable.  A library name of ``None`` falls back
    to the sample name.

    Returns:
        The populated ``Spreadsheet``.
    """
    sheet = self._sheet
    result = Spreadsheet()

    header_row = 0
    data_row = 0
    for row in range(1, 11):
        # Read the label once per row instead of once per comparison.
        label = sheet.cell(row=row, column=1).value
        if label in ('Filename', 'Run Accession'):
            header_row = row
            data_row = row + 1
            break
        if label == 'Study Name':
            result.name = sheet.cell(row=row, column=2).value
        elif label == 'Supplier Name':
            result.supplier = sheet.cell(row=row, column=2).value
        elif label == 'Supplier Organisation':
            result.organisation = sheet.cell(row=row, column=2).value
        elif label == 'Sanger Contact Name':
            result.contact = sheet.cell(row=row, column=2).value
        elif label == 'Sequencing Technology':
            result.technology = sheet.cell(row=row, column=2).value
        elif label == 'Study Accession number':
            result.accession = self.__extract_text_value_xlsx(row, 2)
        elif label == 'Total size of files in GBytes':
            result.size = float(sheet.cell(row=row, column=2).value)
        elif label == 'Data to be kept until':
            result.limit = sheet.cell(row=row, column=2).value.strftime('%d/%m/%Y')

    # Header title -> 1-based column index; None while the title is not found.
    titles = ('Filename', 'Run Accession', 'Mate File', 'Double-ended Reads',
              'Sample Name', 'Sample Accession number', 'Taxon ID', 'Library Name')
    columns = dict.fromkeys(titles)
    for column in range(1, sheet.max_column + 1):
        title = sheet.cell(row=header_row, column=column).value
        if title in titles:
            columns[title] = column

    def text_at(row, column):
        # A column missing from the header behaves like an empty cell.
        return None if column is None else self.__extract_text_value_xlsx(row, column)

    def number_at(row, column):
        # A column missing from the header behaves like an empty cell.
        return None if column is None else self.__extract_float_value_xlsx(row, column)

    # (identifier column, companion column) for each supported sheet layout.
    layouts = (
        (columns['Filename'], columns['Mate File']),
        (columns['Run Accession'], columns['Double-ended Reads']),
    )

    reads = []
    for row in range(data_row, sheet.max_row + 1):
        sample_name = number_at(row, columns['Sample Name'])
        library_name = number_at(row, columns['Library Name'])
        if library_name is None:
            library_name = sample_name
        for identifier_column, companion_column in layouts:
            if identifier_column is None:
                continue
            # Positional order: identifier, companion, sample name,
            # sample accession, taxon id, library name.
            reads.append(RawRead(
                text_at(row, identifier_column),
                text_at(row, companion_column),
                sample_name,
                text_at(row, columns['Sample Accession number']),
                number_at(row, columns['Taxon ID']),
                library_name))
    result.reads = reads
    return result
def load_xls(self):
    """Parse the sheet held in ``self._sheet`` into a ``Spreadsheet``.

    ``self._sheet`` is accessed with 0-based ``cell_value(row, col)``,
    ``nrows`` and ``ncols``; dates are decoded with ``xlrd.xldate_as_tuple``
    using ``self._workbook.datemode``.

    Rows are scanned from the top: column 0 holds a metadata label and
    column 1 its value.  Scanning stops at the first row whose label is
    'Filename' or 'Run Accession'; that row is the header row and data rows
    start on the following row and run to the end of the sheet.

    For every data row one ``RawRead`` is appended per identifier column
    present in the header: 'Filename' (paired with 'Mate File') and
    'Run Accession' (paired with 'Double-ended Reads').  Header titles are
    located regardless of their left-to-right order, and a column that is
    absent from the header row is read as ``None`` for every row instead of
    failing on an unbound variable.  A library name of ``None`` falls back
    to the sample name.

    Returns:
        The populated ``Spreadsheet``.
    """
    sheet = self._sheet
    result = Spreadsheet()

    header_row = 0
    data_row = 0
    for row in range(sheet.nrows):
        # Read the label once per row instead of once per comparison.
        label = sheet.cell_value(row, 0)
        if label in ('Filename', 'Run Accession'):
            header_row = row
            data_row = row + 1
            break
        if label == 'Study Name':
            result.name = sheet.cell_value(row, 1)
        elif label == 'Supplier Name':
            result.supplier = sheet.cell_value(row, 1)
        elif label == 'Supplier Organisation':
            result.organisation = sheet.cell_value(row, 1)
        elif label == 'Sanger Contact Name':
            result.contact = sheet.cell_value(row, 1)
        elif label == 'Sequencing Technology':
            result.technology = sheet.cell_value(row, 1)
        elif label == 'Study Accession number':
            result.accession = self.__extract_text_value_xls(row, 1)
        elif label == 'Total size of files in GBytes':
            result.size = sheet.cell_value(row, 1)
        elif label == 'Data to be kept until':
            # Only the date part of the decoded tuple is used.
            year, month, day = xlrd.xldate_as_tuple(sheet.cell_value(row, 1),
                                                    self._workbook.datemode)[:3]
            result.limit = "%02d/%02d/%04d" % (day, month, year)

    # Header title -> 0-based column index; None while the title is not found.
    titles = ('Filename', 'Run Accession', 'Mate File', 'Double-ended Reads',
              'Sample Name', 'Sample Accession number', 'Taxon ID', 'Library Name')
    columns = dict.fromkeys(titles)
    for column in range(sheet.ncols):
        title = sheet.cell_value(header_row, column)
        if title in titles:
            columns[title] = column

    def text_at(row, column):
        # A column missing from the header behaves like an empty cell.
        return None if column is None else self.__extract_text_value_xls(row, column)

    def number_at(row, column):
        # A column missing from the header behaves like an empty cell.
        return None if column is None else self.__extract_float_value_xls(row, column)

    # (identifier column, companion column) for each supported sheet layout.
    layouts = (
        (columns['Filename'], columns['Mate File']),
        (columns['Run Accession'], columns['Double-ended Reads']),
    )

    reads = []
    for row in range(data_row, sheet.nrows):
        sample_name = number_at(row, columns['Sample Name'])
        library_name = number_at(row, columns['Library Name'])
        if library_name is None:
            library_name = sample_name
        for identifier_column, companion_column in layouts:
            if identifier_column is None:
                continue
            # Positional order: identifier, companion, sample name,
            # sample accession, taxon id, library name.
            reads.append(RawRead(
                text_at(row, identifier_column),
                text_at(row, companion_column),
                sample_name,
                text_at(row, columns['Sample Accession number']),
                number_at(row, columns['Taxon ID']),
                library_name))
    result.reads = reads
    return result