Exemplo n.º 1
0
 def make_spreadsheet_for_ena_download(self):
     """Return a Spreadsheet fixture holding two reads for the ENA-download case.

     Each read carries a bare name/filename in ``forward_read`` and a
     'T'/'F' flag in ``reverse_read``; the study accession is left as None.
     """
     sheet = Spreadsheet()
     sheet.supplier = 'Supplier'
     sheet.organisation = 'Org'
     sheet.contact = 'Contact'
     sheet.technology = 'Illumina'
     sheet.name = 'AStudyName1'
     sheet.accession = None
     sheet.size = 1.90
     sheet.limit = '01/01/2025'

     first_read = RawRead(forward_read='PAIR1', reverse_read='T', sample_name='SAMPLE1',
                          taxon_id='1280', library_name='LIB1', sample_accession=None)
     second_read = RawRead(forward_read='Pair2.fastq.gz', reverse_read='F', sample_name='SAMPLE2',
                           taxon_id='1280', library_name='LIB2', sample_accession=None)
     sheet.reads = [first_read, second_read]
     return sheet
Exemplo n.º 2
0
 def test_copy_files_single_strand(self, copyfile_patch):
     """copy_files copies both files of a paired read and the one file of a single read.

     ``copyfile_patch`` is the mock standing in for the file-copy function
     (the patch decorator is outside this snippet); its recorded calls are
     compared against the expected source/destination pairs.
     """
     paired_read = RawRead(forward_read='PAIR1_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz',
                           sample_name='SAMPLE1', taxon_id='1280', library_name='LIB1',
                           sample_accession=None)
     single_read = RawRead(forward_read='SINGLE.fastq.gz', reverse_read=None,
                           sample_name='SAMPLE2', taxon_id='1280', library_name='LIB2',
                           sample_accession=None)
     spreadsheet = Spreadsheet.new_instance("MyStudy", [paired_read, single_read])
     under_test = Preparation.new_instance(spreadsheet, 'destination', 0, 0)

     under_test.copy_files('source')

     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(copyfile_patch.call_args_list,
                      [call('source/PAIR1_1.fastq.gz', 'destination/0/PAIR1_1.fastq.gz'),
                       call('source/PAIR1_2.fastq.gz', 'destination/0/PAIR1_2.fastq.gz'),
                       call('source/SINGLE.fastq.gz', 'destination/0/SINGLE.fastq.gz')])
Exemplo n.º 3
0
 def test_T_or_F_is_valid(self):
     """Reads whose double-ended flag is 'T' or 'F' produce no validation errors."""
     read_flagged_t = RawRead(forward_read='PAIR1_1.fastq.gz',
                              reverse_read='T',
                              sample_name='SAMPLE1',
                              sample_accession=None,
                              taxon_id="1280",
                              library_name='LIB1')
     read_flagged_f = RawRead(forward_read='PAIR1_1.fastq.gz',
                              reverse_read='F',
                              sample_name='SAMPLE1',
                              sample_accession=None,
                              taxon_id="1280",
                              library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456",
                                            [read_flagged_t, read_flagged_f])

     errors = check_double_ended_column_is_T_or_F(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 4
0
 def test_uniqueness_of_files_sample_and_library_ENA_download(self):
     """Two reads with distinct names, samples and libraries raise no uniqueness errors.

     The reads use the 'T'/'F' flag form of ``reverse_read``.
     """
     first_read = RawRead(forward_read='PAIR1',
                          reverse_read='T',
                          sample_name='SAMPLE1',
                          sample_accession=None,
                          taxon_id="1280",
                          library_name='LIB1')
     second_read = RawRead(forward_read='PAIR2',
                           reverse_read='F',
                           sample_name='SAMPLE2',
                           sample_accession=None,
                           taxon_id="1280",
                           library_name='LIB2')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [first_read, second_read])

     errors = validate_uniqueness_of_reads(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 5
0
 def test_forward_read_not_unique(self):
     """Two reads sharing the same forward file yield one 'not unique' error."""
     shared_forward_file = 'PAIR1_1.fastq.gz'
     first_read = RawRead(forward_read=shared_forward_file,
                          reverse_read='PAIR1_2.fastq.gz',
                          sample_name='SAMPLE1',
                          sample_accession=None,
                          taxon_id="1280",
                          library_name='LIB1')
     second_read = RawRead(forward_read=shared_forward_file,
                           reverse_read='PAIR2_2.fastq.gz',
                           sample_name='SAMPLE2',
                           sample_accession=None,
                           taxon_id="1280",
                           library_name='LIB2')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [first_read, second_read])

     errors = validate_uniqueness_of_reads(spreadsheet)

     self.assertEqual(["Forward read is not unique: PAIR1_1.fastq.gz"], errors)
Exemplo n.º 6
0
 def test_pair_naming_convention_is_valid_for_single_read(self):
     """A read without a reverse file passes the pair-naming validation."""
     single_read = RawRead(forward_read='PAIR1_1.fastq.gz',
                           reverse_read=None,
                           sample_name='SAMPLE1',
                           sample_accession=None,
                           taxon_id="1280",
                           library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [single_read])

     errors = validate_pair_naming_convention(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 7
0
 def test_mandatory_fields_for_reads_are_populated_single_read(self):
     """A single read with every field but ``reverse_read`` set passes validation."""
     single_read = RawRead(forward_read='READ.fastq.gz',
                           reverse_read=None,
                           sample_name='SAMPLE1',
                           sample_accession=None,
                           taxon_id="1280",
                           library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [single_read])

     errors = validate_mandatory_read_fields(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 8
0
 def test_no_hyphen_in_filename(self):
     """Hyphen-free forward and reverse filenames produce no validation errors."""
     paired_read = RawRead(forward_read='PAIR1_1.fastq.gz',
                           reverse_read='PAIR1_2.fastq.gz',
                           sample_name='SAMPLE1',
                           sample_accession=None,
                           taxon_id="1280",
                           library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [paired_read])

     errors = validate_no_hyphen_in_filename(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 9
0
 def test_forward_read_not_populated(self):
     """Validating a read whose ``forward_read`` is None raises an exception."""
     with self.assertRaises(Exception):
         # The fixture is built inside the context manager so that an
         # exception from any step, not only the validator, satisfies the test.
         unpopulated_read = RawRead(forward_read=None,
                                    reverse_read=None,
                                    sample_name='SAMPLE1',
                                    sample_accession=None,
                                    taxon_id="1280",
                                    library_name='LIB1')
         spreadsheet = Spreadsheet.new_instance("1234567890123456", [unpopulated_read])
         validate_mandatory_read_fields(spreadsheet)
Exemplo n.º 10
0
 def test_sample_name_with_valid_char_should_pass_validation(self):
     """The sample name "SAMPLE_1" is accepted by validate_sample_names."""
     read = RawRead(forward_read='PAIR1_1.fastq.gz',
                    reverse_read='PAIR1_2.fastq.gz',
                    sample_name="SAMPLE_1",
                    sample_accession=None,
                    taxon_id="1280",
                    library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("ValidName12345__", [read])

     errors = validate_sample_names(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 11
0
 def test_valid_taxon_id(self):
     """The taxon id "1280" is accepted by validate_taxon_ids."""
     read = RawRead(forward_read='PAIR1_1.fastq.gz',
                    reverse_read='PAIR1_2.fastq.gz',
                    sample_name='SAMPLE1',
                    sample_accession=None,
                    taxon_id="1280",
                    library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("ValidName12345__", [read])

     errors = validate_taxon_ids(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 12
0
 def test_single_read_is_compressed(self):
     """A lone '.fastq.gz' forward file with no reverse file passes the compression check."""
     single_read = RawRead(forward_read='PAIR1.fastq.gz',
                           reverse_read=None,
                           sample_name='SAMPLE1',
                           sample_accession=None,
                           taxon_id="1280",
                           library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [single_read])

     errors = validate_files_are_compressed(spreadsheet)

     self.assertEqual([], errors)
Exemplo n.º 13
0
 def test_none_is_not_valid(self):
     """A None double-ended flag is reported as incorrectly formatted."""
     read_without_flag = RawRead(forward_read='PAIR1_1.fastq.gz',
                                 reverse_read=None,
                                 sample_name='SAMPLE1',
                                 sample_accession=None,
                                 taxon_id="1280",
                                 library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [read_without_flag])

     errors = check_double_ended_column_is_T_or_F(spreadsheet)

     self.assertEqual(["Double-ended is incorrectly formatted, must be T or F"], errors)
Exemplo n.º 14
0
 def _raw_read(self,
               forward_read,
               reverse_read,
               sample_name,
               library_name,
               accession,
               taxon_id='1280'):
     """Build a RawRead from the given values.

     ``accession`` is forwarded as the read's ``sample_accession``;
     ``taxon_id`` defaults to '1280' when not supplied.
     """
     read_fields = {
         'forward_read': forward_read,
         'reverse_read': reverse_read,
         'sample_name': sample_name,
         'sample_accession': accession,
         'taxon_id': taxon_id,
         'library_name': library_name,
     }
     return RawRead(**read_fields)
Exemplo n.º 15
0
 def test_sample_name_with_invalid_char_should_fail_validation(self):
     """A sample name made of 34 disallowed characters yields 34 validation errors."""
     # 34 characters in total; presumably one error is reported per character.
     invalid_sample_name = "!\"£$%^&*()+={}[]:@~;'#?/>.<,|\\`¬\t "
     read = RawRead(forward_read='PAIR1_1.fastq.gz',
                    reverse_read='PAIR1_2.fastq.gz',
                    sample_name=invalid_sample_name,
                    sample_accession=None,
                    taxon_id="1280",
                    library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("ValidName12345__", [read])

     errors = validate_sample_names(spreadsheet)

     self.assertEqual(34, len(errors))
Exemplo n.º 16
0
 def test_library_name_not_populated(self):
     """A read whose ``library_name`` is None is reported with its full repr."""
     expected_errors = [
         "Missing library name for RawRead(forward_read='READ.fastq.gz', reverse_read=None, "
         "sample_name='SAMPLE1', sample_accession=None, taxon_id='1280', library_name=None)"
     ]
     read_without_library = RawRead(forward_read='READ.fastq.gz',
                                    reverse_read=None,
                                    sample_name='SAMPLE1',
                                    sample_accession=None,
                                    taxon_id="1280",
                                    library_name=None)
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [read_without_library])

     errors = validate_mandatory_read_fields(spreadsheet)

     self.assertEqual(expected_errors, errors)
Exemplo n.º 17
0
 def test_path_in_filename_is_invalid(self):
     """Forward and reverse filenames containing a directory path are each reported."""
     forward_with_path = '/some/path/PAIR1_1.fastq.gz'
     reverse_with_path = '/some/path/PAIR1_2.fastq.gz'
     read = RawRead(forward_read=forward_with_path,
                    reverse_read=reverse_with_path,
                    sample_name='SAMPLE1',
                    sample_accession=None,
                    taxon_id="1280",
                    library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [read])

     errors = validate_no_path_in_filename(spreadsheet)

     expected_errors = [
         "Path present in filename: /some/path/PAIR1_1.fastq.gz",
         "Path present in filename: /some/path/PAIR1_2.fastq.gz",
     ]
     self.assertEqual(expected_errors, errors)
Exemplo n.º 18
0
 def test_invalid_pair_naming_convention(self):
     """Forward/reverse filenames with differing stems are reported as inconsistent."""
     expected_errors = [
         "Inconsistent naming convention of forward and reverse reads for RawRead("
         "forward_read='PAIR1xxx_1.fastq.gz', reverse_read='PAIR1_2.fastq.gz', "
         "sample_name='SAMPLE1', sample_accession=None, taxon_id='1280', library_name='LIB1')"
     ]
     mismatched_read = RawRead(forward_read='PAIR1xxx_1.fastq.gz',
                               reverse_read='PAIR1_2.fastq.gz',
                               sample_name='SAMPLE1',
                               sample_accession=None,
                               taxon_id="1280",
                               library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [mismatched_read])

     errors = validate_pair_naming_convention(spreadsheet)

     self.assertEqual(expected_errors, errors)
Exemplo n.º 19
0
 def test_reads_are_not_fastq(self):
     """Files named '*.gz' without a '.fastq' part are reported for both reads."""
     expected_errors = [
         "Forward read file is not correctly formatted for RawRead(forward_read='PAIR1_1.gz', "
         "reverse_read='PAIR1_2.gz', sample_name='SAMPLE1', sample_accession=None, "
         "taxon_id='1280', library_name='LIB1')",
         "Reverse read file is not correctly formatted for RawRead(forward_read='PAIR1_1.gz', "
         "reverse_read='PAIR1_2.gz', sample_name='SAMPLE1', sample_accession=None, "
         "taxon_id='1280', library_name='LIB1')"
     ]
     non_fastq_read = RawRead(forward_read='PAIR1_1.gz',
                              reverse_read='PAIR1_2.gz',
                              sample_name='SAMPLE1',
                              sample_accession=None,
                              taxon_id="1280",
                              library_name='LIB1')
     spreadsheet = Spreadsheet.new_instance("1234567890123456", [non_fastq_read])

     errors = validate_files_are_compressed(spreadsheet)

     self.assertEqual(expected_errors, errors)
Exemplo n.º 20
0
 def setUp(self):
     """Create a temp directory with three fastq.gz files and three Preparation fixtures.

     The fixtures share the same study and reads and differ only in the
     third argument to ``Preparation.new_instance`` (0, 1 and 2).
     NOTE(review): that argument looks like the numbered sub-directory
     under the temp dir ('1/', '2/') -- confirm against Preparation.
     """
     self.tempdir = TempDirectory()
     for relative_path in ('1/Accession1.fastq.gz',
                           '2/Accession1_1.fastq.gz',
                           '2/Accession1_2.fastq.gz'):
         self.tempdir.write(relative_path, b'the text')
     self.tempdir_path = self.tempdir.path
     print('temp',self.tempdir_path)

     def build_preparation(index):
         # Every fixture gets its own freshly built spreadsheet and reads.
         reads = [
             RawRead(forward_read='Accession1', reverse_read='T', sample_name='SAMPLE1',
                     taxon_id='1280', library_name='LIB1', sample_accession=None),
             RawRead(forward_read='Accession2', reverse_read='T', sample_name='SAMPLE2',
                     taxon_id='1280', library_name='LIB2', sample_accession=None),
         ]
         spreadsheet = Spreadsheet.new_instance("MyStudy", reads)
         return Preparation.new_instance(spreadsheet, self.tempdir_path, index, 0)

     self.under_test1 = build_preparation(0)
     self.under_test2 = build_preparation(1)
     self.under_test3 = build_preparation(2)
Exemplo n.º 21
0
 def load_xlsx(self):
     """Build a Spreadsheet from the worksheet held in ``self._sheet``.

     The first ten rows of column 1 are scanned for metadata labels; the
     value for a recognised label is taken from column 2 of the same row.
     The scan stops at the first row whose column 1 reads 'Filename' or
     'Run Accession': that row is the header of the read table, and every
     following row up to ``max_row`` describes a read.

     Header columns are located by label regardless of their left-to-right
     order, so 'Mate File' placed before 'Filename' (or 'Double-ended
     Reads' before 'Run Accession') is still recognised.

     Returns:
         The populated Spreadsheet with its ``reads`` list set.

     Raises:
         ValueError: if there are data rows to read but the header row
             lacks one of the columns needed to read them (previously
             this surfaced as an UnboundLocalError).
     """
     sheet = self._sheet
     result = Spreadsheet()

     # Metadata labels whose column-2 value is stored on the result as-is.
     plain_fields = {
         'Study Name': 'name',
         'Supplier Name': 'supplier',
         'Supplier Organisation': 'organisation',
         'Sanger Contact Name': 'contact',
         'Sequencing Technology': 'technology',
     }

     header_row = 0
     data_row = 0
     for row in range(1, 11):
         # Read the label once per row instead of once per comparison.
         label = sheet.cell(row=row, column=1).value
         if label in ('Filename', 'Run Accession'):
             header_row = row
             data_row = row + 1
             break
         if label in plain_fields:
             setattr(result, plain_fields[label], sheet.cell(row=row, column=2).value)
         elif label == 'Study Accession number':
             result.accession = self.__extract_text_value_xlsx(row, 2)
         elif label == 'Total size of files in GBytes':
             result.size = float(sheet.cell(row=row, column=2).value)
         elif label == 'Data to be kept until':
             result.limit = sheet.cell(row=row, column=2).value.strftime('%d/%m/%Y')

     # Map every header label to its column; a repeated label keeps the
     # right-most column, matching the previous overwrite behaviour.
     columns = {}
     for column in range(1, sheet.max_column + 1):
         columns[sheet.cell(row=header_row, column=column).value] = column

     filename_column = columns.get('Filename')
     run_accession_column = columns.get('Run Accession')

     data_rows = range(data_row, sheet.max_row + 1)
     if data_rows:
         # Only insist on the columns that the loop below will actually read.
         required = ['Sample Name', 'Library Name']
         if filename_column is not None or run_accession_column is not None:
             required += ['Sample Accession number', 'Taxon ID']
         if filename_column is not None:
             required.append('Mate File')
         if run_accession_column is not None:
             required.append('Double-ended Reads')
         missing = [name for name in required if name not in columns]
         if missing:
             raise ValueError("Missing column header(s): " + ", ".join(missing))

     mate_filename_column = columns.get('Mate File')
     double_ended_reads_column = columns.get('Double-ended Reads')
     sample_name_column = columns.get('Sample Name')
     sample_accession_column = columns.get('Sample Accession number')
     taxon_id_column = columns.get('Taxon ID')
     library_name_column = columns.get('Library Name')

     extract_text = self.__extract_text_value_xlsx
     extract_float = self.__extract_float_value_xlsx

     reads = []
     for row in data_rows:
         sample_name = extract_float(row, sample_name_column)
         library_name = extract_float(row, library_name_column)
         if library_name is None:
             # An empty library name falls back to the sample name.
             library_name = sample_name
         # The two branches are independent: a header carrying both
         # 'Filename' and 'Run Accession' yields two reads per row.
         if filename_column is not None:
             reads.append(RawRead(
                 extract_text(row, filename_column),
                 extract_text(row, mate_filename_column),
                 sample_name,
                 extract_text(row, sample_accession_column),
                 extract_float(row, taxon_id_column),
                 library_name))
         if run_accession_column is not None:
             reads.append(RawRead(
                 extract_text(row, run_accession_column),
                 extract_text(row, double_ended_reads_column),
                 sample_name,
                 extract_text(row, sample_accession_column),
                 extract_float(row, taxon_id_column),
                 library_name))
     result.reads = reads
     return result
Exemplo n.º 22
0
 def load_xls(self):
     """Build a Spreadsheet from the legacy-format worksheet in ``self._sheet``.

     Rows are scanned from the top for metadata labels in column 0; the
     value for a recognised label is taken from column 1 of the same row.
     The scan stops at the first row whose column 0 reads 'Filename' or
     'Run Accession': that row is the header of the read table, and every
     following row up to ``nrows`` describes a read. The 'Data to be kept
     until' cell is converted with ``xlrd.xldate_as_tuple`` using the
     workbook's datemode and stored as 'dd/mm/yyyy'.

     Header columns are located by label regardless of their left-to-right
     order, so 'Mate File' placed before 'Filename' (or 'Double-ended
     Reads' before 'Run Accession') is still recognised.

     Returns:
         The populated Spreadsheet with its ``reads`` list set.

     Raises:
         ValueError: if there are data rows to read but the header row
             lacks one of the columns needed to read them (previously
             this surfaced as an UnboundLocalError).
     """
     sheet = self._sheet
     result = Spreadsheet()

     # Metadata labels whose column-1 value is stored on the result as-is.
     plain_fields = {
         'Study Name': 'name',
         'Supplier Name': 'supplier',
         'Supplier Organisation': 'organisation',
         'Sanger Contact Name': 'contact',
         'Sequencing Technology': 'technology',
         'Total size of files in GBytes': 'size',
     }

     header_row = 0
     data_row = 0
     for row in range(sheet.nrows):
         # Read the label once per row instead of once per comparison.
         label = sheet.cell_value(row, 0)
         if label in ('Filename', 'Run Accession'):
             header_row = row
             data_row = row + 1
             break
         if label in plain_fields:
             setattr(result, plain_fields[label], sheet.cell_value(row, 1))
         elif label == 'Study Accession number':
             result.accession = self.__extract_text_value_xls(row, 1)
         elif label == 'Data to be kept until':
             # Only the date part of the tuple is needed.
             year, month, day = xlrd.xldate_as_tuple(sheet.cell_value(row, 1),
                                                     self._workbook.datemode)[:3]
             result.limit = "%02d/%02d/%04d" % (day, month, year)

     # Map every header label to its column; a repeated label keeps the
     # right-most column, matching the previous overwrite behaviour.
     columns = {}
     for column in range(sheet.ncols):
         columns[sheet.cell_value(header_row, column)] = column

     filename_column = columns.get('Filename')
     run_accession_column = columns.get('Run Accession')

     data_rows = range(data_row, sheet.nrows)
     if data_rows:
         # Only insist on the columns that the loop below will actually read.
         required = ['Sample Name', 'Library Name']
         if filename_column is not None or run_accession_column is not None:
             required += ['Sample Accession number', 'Taxon ID']
         if filename_column is not None:
             required.append('Mate File')
         if run_accession_column is not None:
             required.append('Double-ended Reads')
         missing = [name for name in required if name not in columns]
         if missing:
             raise ValueError("Missing column header(s): " + ", ".join(missing))

     mate_filename_column = columns.get('Mate File')
     double_ended_reads_column = columns.get('Double-ended Reads')
     sample_name_column = columns.get('Sample Name')
     sample_accession_column = columns.get('Sample Accession number')
     taxon_id_column = columns.get('Taxon ID')
     library_name_column = columns.get('Library Name')

     extract_text = self.__extract_text_value_xls
     extract_float = self.__extract_float_value_xls

     reads = []
     for row in data_rows:
         sample_name = extract_float(row, sample_name_column)
         library_name = extract_float(row, library_name_column)
         if library_name is None:
             # An empty library name falls back to the sample name.
             library_name = sample_name
         # The two branches are independent: a header carrying both
         # 'Filename' and 'Run Accession' yields two reads per row.
         if filename_column is not None:
             reads.append(RawRead(
                 extract_text(row, filename_column),
                 extract_text(row, mate_filename_column),
                 sample_name,
                 extract_text(row, sample_accession_column),
                 extract_float(row, taxon_id_column),
                 library_name))
         if run_accession_column is not None:
             reads.append(RawRead(
                 extract_text(row, run_accession_column),
                 extract_text(row, double_ended_reads_column),
                 sample_name,
                 extract_text(row, sample_accession_column),
                 extract_float(row, taxon_id_column),
                 library_name))
     result.reads = reads
     return result