def test_to_str(self):
    """Check the options string produced by ``to_str`` for the parameter
    set with id 1."""
    expected = ("--barcode_type golay_12 --max_bad_run_length 3 "
                "--max_barcode_errors 1.5 --min_per_read_length_fraction 0.75 "
                "--phred_quality_threshold 3 --sequence_max_n 0")
    observed = PreprocessedIlluminaParams(1).to_str()
    self.assertEqual(observed, expected)
def test_create_duplicate(self):
    """``create`` must raise QiitaDBDuplicateError for the values below."""
    duplicated_values = dict(
        max_bad_run_length=3,
        min_per_read_length_fraction=0.75,
        sequence_max_n=0,
        rev_comp_barcode=False,
        rev_comp_mapping_barcodes=False,
        rev_comp=False,
        phred_quality_threshold=3,
        barcode_type="golay_12",
        max_barcode_errors=1.5)

    with self.assertRaises(QiitaDBDuplicateError):
        PreprocessedIlluminaParams.create("test_error", **duplicated_values)
def test_insert_preprocessed_data(self):
    """Run ``_insert_preprocessed_data`` on a temporary output directory and
    check that the files were copied and the preprocessed data row with
    id 3 exists."""
    study = Study(1)
    params = PreprocessedIlluminaParams(1)
    prep_template = PrepTemplate(1)

    # Temporary output directory, registered for clean up
    prep_out_dir = mkdtemp()
    self.dirs_to_remove.append(prep_out_dir)

    db_preprocessed_dir = join(self.db_dir, "preprocessed_data")
    file_suffixes = ['seqs.fna', 'seqs.fastq', 'seqs.demux',
                     'split_library_log.txt']

    # Create one placeholder file per suffix and record where each one is
    # expected to end up in the db directory
    db_files = []
    for suffix in file_suffixes:
        out_fp = join(prep_out_dir, suffix)
        with open(out_fp, 'w') as handle:
            handle.write("\n")
        self.files_to_remove.append(out_fp)
        db_files.append(join(db_preprocessed_dir, "3_%s" % suffix))
    self.files_to_remove.extend(db_files)

    _insert_preprocessed_data(study, params, prep_template, prep_out_dir)

    # Check that the files have been copied
    for db_fp in db_files:
        self.assertTrue(exists(db_fp))

    # Check that a new preprocessed data has been created
    sql = ("SELECT EXISTS(SELECT * FROM qiita.preprocessed_data WHERE "
           "preprocessed_data_id=%s)")
    row = self.conn_handler.execute_fetchone(sql, (3, ))
    self.assertTrue(row[0])
def test_get_preprocess_fastq_cmd(self):
    """Check the command built by ``_get_preprocess_fastq_cmd`` when using
    the 'per sample FASTQ defaults' parameter set."""
    raw_data = RawData(1)
    matching_params = [
        candidate for candidate in PreprocessedIlluminaParams.iter()
        if candidate.name == 'per sample FASTQ defaults']
    params = matching_params[0]
    prep_template = PrepTemplate(1)

    obs_cmd, obs_output_dir = _get_preprocess_fastq_cmd(
        raw_data, prep_template, params)

    raw_data_dir = join(self.db_dir, 'raw_data')
    seqs_fp = join(raw_data_dir, '1_s_G1_L001_sequences.fastq.gz')
    bc_fp = join(raw_data_dir, '1_s_G1_L001_sequences_barcodes.fastq.gz')

    head_template = ("split_libraries_fastq.py --store_demultiplexed_fastq -i "
                     "{} -b {} "
                     "-m ")
    exp_cmd_1 = head_template.format(seqs_fp, bc_fp)
    tail_template = (
        "-o {0} --barcode_type not-barcoded --max_bad_run_length 3 "
        "--max_barcode_errors 1.5 --min_per_read_length_fraction 0.75 "
        "--phred_quality_threshold 3 --sequence_max_n 0")
    exp_cmd_2 = tail_template.format(obs_output_dir)

    # We are splitting the command into two parts because there is no way
    # that we can know the filepath of the mapping file. We thus split the
    # command on the mapping file path and we check that the two parts
    # of the command are correct
    head_len = len(exp_cmd_1)
    obs_cmd_1 = obs_cmd[:head_len]
    # Drop the first space-delimited token of the remainder (the mapping
    # file path) and keep everything after it
    obs_cmd_2 = obs_cmd[head_len:].split(" ", 1)[1]

    self.assertEqual(obs_cmd_1, exp_cmd_1)
    self.assertEqual(obs_cmd_2, exp_cmd_2)
def test_get_preprocess_fastq_cmd(self):
    """Check the command built by ``_get_preprocess_fastq_cmd`` when using
    the parameter set with id 1."""
    raw_data = RawData(1)
    params = PreprocessedIlluminaParams(1)
    prep_template = PrepTemplate(1)

    obs_cmd, obs_output_dir = _get_preprocess_fastq_cmd(
        raw_data, prep_template, params)

    raw_data_dir = join(self.db_dir, 'raw_data')
    seqs_fp = join(raw_data_dir, '1_s_G1_L001_sequences.fastq.gz')
    bc_fp = join(raw_data_dir, '1_s_G1_L001_sequences_barcodes.fastq.gz')

    head_template = ("split_libraries_fastq.py --store_demultiplexed_fastq -i "
                     "{} -b {} "
                     "-m ")
    exp_cmd_1 = head_template.format(seqs_fp, bc_fp)
    tail_template = ("-o {0} --barcode_type golay_12 --max_bad_run_length 3 "
                     "--max_barcode_errors 1.5 "
                     "--min_per_read_length_fraction 0.75 "
                     "--phred_quality_threshold 3 "
                     "--sequence_max_n 0")
    exp_cmd_2 = tail_template.format(obs_output_dir)

    # We are splitting the command into two parts because there is no way
    # that we can know the filepath of the mapping file. We thus split the
    # command on the mapping file path and we check that the two parts
    # of the command are correct
    head_len = len(exp_cmd_1)
    obs_cmd_1 = obs_cmd[:head_len]
    # Drop the first space-delimited token of the remainder (the mapping
    # file path) and keep everything after it
    obs_cmd_2 = obs_cmd[head_len:].split(" ", 1)[1]

    self.assertEqual(obs_cmd_1, exp_cmd_1)
    self.assertEqual(obs_cmd_2, exp_cmd_2)
def test_get_preprocess_fastq_cmd_per_sample_FASTQ_failure(self):
    """``_get_preprocess_fastq_cmd`` must raise ValueError for a
    per_sample_FASTQ raw data created with a forward and a barcodes file."""
    sample_metadata = {
        'run_prefix': "sample1_failure",
        'primer': 'A',
        'barcode': 'A',
        'center_name': 'ANL',
        'platform': 'ILLUMINA',
        'library_construction_protocol': 'A',
        'experiment_design_description': 'A'}
    metadata_dict = {'SKB8.640193': sample_metadata}
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, Study(1), '16S')

    # This part should fail
    created_fps = []
    for fname in ('sample1_failure.fastq',
                  'sample1_failure.barcodes.fastq.gz'):
        fp = self.path_builder(fname)
        with open(fp, 'w') as handle:
            handle.write('\n')
        self.files_to_remove.append(fp)
        created_fps.append(fp)
    fp1, fp2 = created_fps

    forward_filepath_id = convert_to_id('raw_forward_seqs', 'filepath_type')
    barcode_filepath_id = convert_to_id('raw_barcodes', 'filepath_type')
    fps = [(fp1, forward_filepath_id), (fp2, barcode_filepath_id)]

    filetype_id = get_filetypes()['per_sample_FASTQ']
    raw_data = RawData.create(filetype_id, [prep_template], fps)

    matching_params = [
        candidate for candidate in PreprocessedIlluminaParams.iter()
        if candidate.name == 'per sample FASTQ defaults']
    params = matching_params[0]

    with self.assertRaises(ValueError):
        _get_preprocess_fastq_cmd(raw_data, prep_template, params)
def test_values(self):
    """Check the ``values`` dictionary of the parameter set with id 1."""
    expected = {
        'max_barcode_errors': 1.5,
        'sequence_max_n': 0,
        'max_bad_run_length': 3,
        'rev_comp': False,
        'phred_quality_threshold': 3,
        'rev_comp_barcode': False,
        'rev_comp_mapping_barcodes': False,
        'min_per_read_length_fraction': 0.75,
        'barcode_type': 'golay_12'}
    observed = PreprocessedIlluminaParams(1).values
    self.assertEqual(observed, expected)
def render(self, prep, study_id, is_editable, ena_terms, study_status,
           user_defined_terms):
    """Render the prep template panel for ``prep``.

    ``study_id``, ``is_editable``, ``ena_terms``, ``study_status`` and
    ``user_defined_terms`` are forwarded unchanged to the template.

    Raises
    ------
    ValueError
        If the filetype of the raw data linked to ``prep`` is not one of
        'SFF', 'FASTA' or 'FASTQ'.
    """
    # Check if the request came from a local source
    is_local_request = self._is_local()

    prep_id = prep.id
    status_class1, status_class2, status_color = STATUS_STYLER[prep.status]
    data_type = prep.data_type()
    raw_data = RawData(prep.raw_data)
    filepaths = prep.get_filepaths()
    investigation_type = prep.investigation_type
    preprocessed_data = prep.preprocessed_data
    preprocessing_status = prep.preprocessing_status

    # The available preprocessing parameter sets depend on the filetype
    filetype = raw_data.filetype
    if filetype in ('SFF', 'FASTA'):
        param_iter = Preprocessed454Params.iter()
    elif filetype == 'FASTQ':
        param_iter = PreprocessedIlluminaParams.iter()
    else:
        # Name the offending filetype so the failure can be diagnosed
        raise ValueError(
            "Pre-processing of %s files currently not supported."
            % filetype)

    # One (id, name, html description) tuple per parameter set
    preprocess_options = []
    for param in param_iter:
        text = ("<b>%s:</b> %s" % (k, v)
                for k, v in viewitems(param.values))
        preprocess_options.append(
            (param.id, param.name, '<br>'.join(text)))

    # Unfortunately, both the prep template and the qiime mapping files
    # have the sample type. The way to differentiate them is if we have
    # the substring 'qiime' in the basename
    filepaths = [
        (id_, fp,
         "Qiime mapping" if 'qiime' in basename(fp) else "Prep template")
        for id_, fp in filepaths]

    return self.render_string(
        "study_description_templates/prep_template_panel.html",
        prep_id=prep_id,
        status_class1=status_class1,
        status_class2=status_class2,
        status_color=status_color,
        data_type=data_type,
        filepaths=filepaths,
        investigation_type=investigation_type,
        preprocessed_data=preprocessed_data,
        preprocessing_status=preprocessing_status,
        study_id=study_id,
        is_local_request=is_local_request,
        is_editable=is_editable,
        ena_terms=ena_terms,
        study_status=study_status,
        user_defined_terms=user_defined_terms,
        preprocess_options=preprocess_options)
def test_check_columns(self):
    """Exercise ``_check_columns`` with missing, extra and exact column
    sets."""
    accepted_columns = dict(
        max_bad_run_length=3,
        min_per_read_length_fraction=0.75,
        sequence_max_n=0,
        rev_comp_barcode=False,
        rev_comp_mapping_barcodes=False,
        rev_comp=False,
        phred_quality_threshold=3,
        barcode_type="hamming_8",
        max_barcode_errors=1.5)

    # Check missing columns
    with self.assertRaises(ValueError):
        PreprocessedIlluminaParams._check_columns(barcode_type=8)

    # Check extra columns
    with self.assertRaises(ValueError):
        PreprocessedIlluminaParams._check_columns(
            extra_columns="Foo", **accepted_columns)

    # Does not raise any error
    PreprocessedIlluminaParams._check_columns(**accepted_columns)
def render(self, prep, study_id, is_editable, ena_terms, study_status,
           user_defined_terms):
    """Render the prep template panel for ``prep``.

    ``study_id``, ``is_editable``, ``ena_terms``, ``study_status`` and
    ``user_defined_terms`` are forwarded unchanged to the template.

    Raises
    ------
    ValueError
        If the filetype of the raw data linked to ``prep`` is not one of
        'SFF', 'FASTA' or 'FASTQ'.
    """
    # Check if the request came from a local source
    is_local_request = self._is_local()

    prep_id = prep.id
    data_type = prep.data_type()
    raw_data = RawData(prep.raw_data)
    filepaths = prep.get_filepaths()
    investigation_type = prep.investigation_type
    preprocessed_data = prep.preprocessed_data
    preprocessing_status = prep.preprocessing_status

    # The available preprocessing parameter sets depend on the filetype
    filetype = raw_data.filetype
    if filetype in ('SFF', 'FASTA'):
        param_iter = Preprocessed454Params.iter()
    elif filetype == 'FASTQ':
        param_iter = PreprocessedIlluminaParams.iter()
    else:
        # Name the offending filetype so the failure can be diagnosed
        raise ValueError(
            "Pre-processing of %s files currently not supported."
            % filetype)

    # One (id, name, html description) tuple per parameter set
    preprocess_options = []
    for param in param_iter:
        text = ("<b>%s:</b> %s" % (k, v)
                for k, v in viewitems(param.values))
        preprocess_options.append(
            (param.id, param.name, '<br>'.join(text)))

    # Unfortunately, both the prep template and the qiime mapping files
    # have the sample type. The way to differentiate them is if we have
    # the substring 'qiime' in the basename
    filepaths = [
        (id_, fp,
         "Qiime mapping" if 'qiime' in basename(fp) else "Prep template")
        for id_, fp in filepaths]

    return self.render_string(
        "study_description_templates/prep_template_panel.html",
        prep_id=prep_id,
        data_type=data_type,
        filepaths=filepaths,
        investigation_type=investigation_type,
        preprocessed_data=preprocessed_data,
        preprocessing_status=preprocessing_status,
        study_id=study_id,
        is_local_request=is_local_request,
        is_editable=is_editable,
        ena_terms=ena_terms,
        study_status=study_status,
        user_defined_terms=user_defined_terms,
        preprocess_options=preprocess_options)
def test_exists(self):
    """``exists`` is true for the golay_12 values and false once the
    barcode type is switched to hamming_8."""
    # Values shared by both lookups; only the barcode type differs
    shared_values = dict(
        max_bad_run_length=3,
        min_per_read_length_fraction=0.75,
        sequence_max_n=0,
        rev_comp_barcode=False,
        rev_comp_mapping_barcodes=False,
        rev_comp=False,
        phred_quality_threshold=3,
        max_barcode_errors=1.5)

    found = PreprocessedIlluminaParams.exists(
        barcode_type="golay_12", **shared_values)
    self.assertTrue(found)

    found = PreprocessedIlluminaParams.exists(
        barcode_type="hamming_8", **shared_values)
    self.assertFalse(found)
def test_get_preprocess_fastq_cmd_per_sample_FASTQ(self):
    """Check the command built by ``_get_preprocess_fastq_cmd`` for a
    per_sample_FASTQ raw data with two forward read files."""
    metadata_dict = {
        'SKB8.640193': {
            'run_prefix': "sample1",
            'primer': 'A',
            'barcode': 'A',
            'center_name': 'ANL',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'A',
            'experiment_design_description': 'A'},
        'SKD8.640184': {
            'run_prefix': "sample2",
            'primer': 'A',
            'barcode': 'A',
            'center_name': 'ANL',
            'platform': 'ILLUMINA',
            'instrument_model': 'Illumina MiSeq',
            'library_construction_protocol': 'A',
            'experiment_design_description': 'A'}}
    md_template = pd.DataFrame.from_dict(metadata_dict, orient='index')
    prep_template = PrepTemplate.create(md_template, Study(1), '16S')

    # Create one placeholder file per sample and register it for clean up
    created_fps = []
    for fname in ('sample1.fastq', 'sample2.fastq.gz'):
        fp = self.path_builder(fname)
        with open(fp, 'w') as handle:
            handle.write('\n')
        self.files_to_remove.append(fp)
        created_fps.append(fp)
    fp1, fp2 = created_fps

    filepath_id = convert_to_id('raw_forward_seqs', 'filepath_type')
    fps = [(fp1, filepath_id), (fp2, filepath_id)]

    filetype_id = get_filetypes()['per_sample_FASTQ']
    raw_data = RawData.create(filetype_id, [prep_template], fps)

    matching_params = [
        candidate for candidate in PreprocessedIlluminaParams.iter()
        if candidate.name == 'per sample FASTQ defaults']
    params = matching_params[0]

    obs_cmd, obs_output_dir = _get_preprocess_fastq_cmd(
        raw_data, prep_template, params)

    # Comma-separated filepaths, in the sorted order of the tuples returned
    # by get_filepaths
    sorted_entries = sorted(raw_data.get_filepaths())
    raw_fps = ','.join([fp for _, fp, _ in sorted_entries])

    cmd_template = (
        "split_libraries_fastq.py --store_demultiplexed_fastq -i "
        "{} --sample_ids 1.SKB8.640193,1.SKD8.640184 -o {} --barcode_type "
        "not-barcoded --max_bad_run_length 3 --max_barcode_errors 1.5 "
        "--min_per_read_length_fraction 0.75 --phred_quality_threshold 3 "
        "--sequence_max_n 0")
    exp_cmd = cmd_template.format(raw_fps, obs_output_dir)

    self.assertEqual(obs_cmd, exp_cmd)
def test_create(self):
    """Create a parameter set from string values and check the options
    string returned by its ``to_str``."""
    string_values = dict(
        max_bad_run_length="3",
        min_per_read_length_fraction="0.75",
        sequence_max_n="0",
        rev_comp_barcode="False",
        rev_comp_mapping_barcodes="False",
        rev_comp="False",
        phred_quality_threshold="3",
        barcode_type="hamming_8",
        max_barcode_errors="1.5")
    created = PreprocessedIlluminaParams.create(
        "test_create", **string_values)

    expected = ("--barcode_type hamming_8 --max_bad_run_length 3 "
                "--max_barcode_errors 1.5 --min_per_read_length_fraction 0.75 "
                "--phred_quality_threshold 3 --sequence_max_n 0")
    self.assertEqual(created.to_str(), expected)
def render(self, prep, study_id, is_editable, ena_terms, study_status,
           user_defined_terms, raw_data_files):
    """Render the prep template panel for ``prep``.

    ``study_id``, ``is_editable``, ``ena_terms``, ``study_status`` and
    ``user_defined_terms`` are forwarded unchanged to the template.
    ``raw_data_files`` is only used for its length, to choose which set of
    column restrictions the prep template is checked against.

    Raises
    ------
    ValueError
        If the filetype of the raw data linked to ``prep`` is not one of
        'SFF', 'FASTA' or 'FASTQ'.
    """
    # Check if the request came from a local source
    is_local_request = self._is_local()

    prep_id = prep.id
    status_class1, status_class2, status_color = STATUS_STYLER[prep.status]
    data_type = prep.data_type()
    raw_data = RawData(prep.raw_data)
    filepaths = prep.get_filepaths()
    investigation_type = prep.investigation_type
    preprocessed_data = prep.preprocessed_data
    preprocessing_status = prep.preprocessing_status

    # The available preprocessing parameter sets depend on the filetype
    filetype = raw_data.filetype
    if filetype in ('SFF', 'FASTA'):
        param_iter = Preprocessed454Params.iter()
    elif filetype == 'FASTQ':
        param_iter = PreprocessedIlluminaParams.iter()
    else:
        # Name the offending filetype so the failure can be diagnosed
        raise ValueError(
            "Pre-processing of %s files currently not supported."
            % filetype)

    # One (id, name, html description) tuple per parameter set
    preprocess_options = []
    for param in param_iter:
        text = ("<b>%s:</b> %s" % (k, v)
                for k, v in viewitems(param.values))
        preprocess_options.append(
            (param.id, param.name, '<br>'.join(text)))

    # Unfortunately, both the prep template and the qiime mapping files
    # have the sample type. The way to differentiate them is if we have
    # the substring 'qiime' in the basename
    filepaths = [
        (id_, fp,
         "Qiime mapping" if 'qiime' in basename(fp) else "Prep template")
        for id_, fp in filepaths]

    # Check if the template has all the required columns for preprocessing.
    # Preprocessing is allowed unless a target gene template is missing
    # required columns.
    show_preprocess_btn = True
    no_preprocess_msg = None
    # Reuse the data type fetched above instead of querying it again
    if data_type in TARGET_GENE_DATA_TYPES:
        key = ('demultiplex_multiple' if len(raw_data_files) > 2
               else 'demultiplex')
        missing_cols = prep.check_restrictions(
            [PREP_TEMPLATE_COLUMNS_TARGET_GENE[key]])
        if missing_cols:
            show_preprocess_btn = False
            no_preprocess_msg = (
                "Preprocessing disabled due to missing columns in the "
                "prep template: %s" % ', '.join(missing_cols))

    return self.render_string(
        "study_description_templates/prep_template_panel.html",
        prep_id=prep_id,
        status_class1=status_class1,
        status_class2=status_class2,
        status_color=status_color,
        data_type=data_type,
        filepaths=filepaths,
        investigation_type=investigation_type,
        preprocessed_data=preprocessed_data,
        preprocessing_status=preprocessing_status,
        study_id=study_id,
        is_local_request=is_local_request,
        is_editable=is_editable,
        ena_terms=ena_terms,
        study_status=study_status,
        user_defined_terms=user_defined_terms,
        preprocess_options=preprocess_options,
        show_preprocess_btn=show_preprocess_btn,
        no_preprocess_msg=no_preprocess_msg)
def render(self, study, prep_template, full_access, ena_terms,
           user_defined_terms):
    """Render the prep template info tab for ``prep_template``.

    Collects the template and qiime mapping file links, the available
    filetypes, the uploaded files of ``study``, the raw data accessible to
    the current user and, when a raw data is linked, the preprocessing
    options and whether preprocessing can be launched.

    ``ena_terms`` and ``user_defined_terms`` are forwarded unchanged to the
    template; ``full_access`` is not used by this method.

    Raises
    ------
    NotImplementedError
        If the linked raw data has a filetype other than 'SFF', 'FASTA',
        'FASTQ' or 'per_sample_FASTQ'.
    """
    user = self.current_user
    is_local_request = is_localhost(self.request.headers['host'])

    # Unfortunately, both the prep template and the qiime mapping files
    # have the sample type. The way to differentiate them is if we have
    # the substring 'qiime' in the basename
    template_fps = []
    qiime_fps = []
    for fp_id, fp in prep_template.get_filepaths():
        if 'qiime' in basename(fp):
            label, bucket = 'Qiime mapping', qiime_fps
        else:
            label, bucket = 'Prep template', template_fps
        bucket.append(
            download_link_or_path(is_local_request, fp, fp_id, label))

    # Since get_filepaths returns the paths sorted from newest to oldest,
    # the first in both lists is the latest one
    current_template_fp = template_fps[0]
    current_qiime_fp = qiime_fps[0]

    # Anything after the first entry is an older version; None when there
    # are no older versions
    old_templates = template_fps[1:] or None
    show_old_templates = old_templates is not None
    old_qiime_fps = qiime_fps[1:] or None
    show_old_qiime_fps = old_qiime_fps is not None

    # (filetype, filetype id, filepath types) tuples ordered by filetype id
    filetypes = [(ft, ft_id, fp_type_by_ft[ft])
                 for ft, ft_id in viewitems(get_filetypes())]
    filetypes.sort(key=itemgetter(1))

    files = [fname
             for _, fname in get_files_from_uploads_folders(str(study.id))]

    accessible_raw_data = _get_accessible_raw_data(user)
    other_studies_rd = sorted(viewitems(accessible_raw_data))

    # A prep template can be modified if its status is sandbox
    is_editable = prep_template.status == 'sandbox'

    raw_data_id = prep_template.raw_data

    # Values used when the prep template has no raw data attached
    preprocess_options = []
    preprocessed_data = None
    show_preprocess_btn = True
    no_preprocess_msg = None

    if raw_data_id:
        rd = RawData(raw_data_id)
        rd_ft = rd.filetype

        # If the prep template has a raw data associated, it can be
        # preprocessed. Retrieve the pre-processing parameters
        if rd_ft in ('SFF', 'FASTA'):
            param_iter = Preprocessed454Params.iter()
        elif rd_ft == 'FASTQ':
            param_iter = [
                pip for pip in PreprocessedIlluminaParams.iter()
                if pip.values['barcode_type'] != 'not-barcoded']
        elif rd_ft == 'per_sample_FASTQ':
            param_iter = [
                pip for pip in PreprocessedIlluminaParams.iter()
                if pip.values['barcode_type'] == 'not-barcoded']
        else:
            raise NotImplementedError(
                "Pre-processing of %s files currently not supported."
                % rd_ft)

        # One (id, name, html description) tuple per parameter set
        preprocess_options = []
        for param in param_iter:
            description = '<br>'.join(
                "<b>%s:</b> %s" % (k, v)
                for k, v in viewitems(param.values))
            preprocess_options.append((param.id, param.name, description))

        preprocessed_data = prep_template.preprocessed_data

        # Check if the template has all the required columns for
        # preprocessing
        raw_data_files = rd.get_filepaths()
        if len(raw_data_files) == 0:
            show_preprocess_btn = False
            no_preprocess_msg = (
                "Preprocessing disabled because there are no files "
                "linked with the Raw Data")
        elif prep_template.data_type() in TARGET_GENE_DATA_TYPES:
            n_forward = sum(1 for _, _, ftype in raw_data_files
                            if ftype == 'raw_forward_seqs')
            if n_forward > 1:
                key = 'demultiplex_multiple'
            else:
                key = 'demultiplex'
            missing_cols = prep_template.check_restrictions(
                [PREP_TEMPLATE_COLUMNS_TARGET_GENE[key]])

            # per_sample_FASTQ only needs the run_prefix column; any other
            # filetype needs every restricted column
            if rd_ft == 'per_sample_FASTQ':
                show_preprocess_btn = 'run_prefix' not in missing_cols
            else:
                show_preprocess_btn = len(missing_cols) == 0

            if show_preprocess_btn:
                no_preprocess_msg = None
            else:
                no_preprocess_msg = (
                    "Preprocessing disabled due to missing columns in "
                    "the prep template: %s" % ', '.join(missing_cols))

    preprocessing_status = prep_template.preprocessing_status

    return self.render_string(
        "study_description_templates/prep_template_info_tab.html",
        pt_id=prep_template.id,
        study_id=study.id,
        raw_data=raw_data_id,
        current_template_fp=current_template_fp,
        current_qiime_fp=current_qiime_fp,
        show_old_templates=show_old_templates,
        old_templates=old_templates,
        show_old_qiime_fps=show_old_qiime_fps,
        old_qiime_fps=old_qiime_fps,
        filetypes=filetypes,
        files=files,
        other_studies_rd=other_studies_rd,
        prep_template=prep_template,
        study=study,
        ena_terms=ena_terms,
        user_defined_terms=user_defined_terms,
        investigation_type=prep_template.investigation_type,
        is_editable=is_editable,
        preprocess_options=preprocess_options,
        preprocessed_data=preprocessed_data,
        preprocessing_status=preprocessing_status,
        show_preprocess_btn=show_preprocess_btn,
        no_preprocess_msg=no_preprocess_msg)
def test_iter(self):
    """Check that ``iter`` yields the parameter set with id 1 first."""
    obs = list(PreprocessedIlluminaParams.iter())
    exp = [PreprocessedIlluminaParams(1)]

    # zip() stops at the shorter sequence, so without this guard an empty
    # (or too short) result would make the loop below pass vacuously
    self.assertGreaterEqual(len(obs), len(exp))

    for o, e in zip(obs, exp):
        self.assertEqual(o.id, e.id)
def test_name(self):
    """The parameter set with id 1 is named "Defaults"."""
    params = PreprocessedIlluminaParams(1)
    self.assertEqual(params.name, "Defaults")