def test_shogun_utree(self):
    """End-to-end run of the ``shogun`` command using the 'utree' aligner.

    Creates a prep template and a paired-end per_sample_FASTQ artifact
    through the Qiita test API, registers a processing job, runs
    ``shogun`` and verifies the returned ArtifactInfo objects.
    """
    # inserting new prep template
    prep_info_dict = {
        'SKB8.640193': {'run_prefix': 'S22205_S104'},
        'SKD8.640184': {'run_prefix': 'S22282_S102'}}
    data = {'prep_info': dumps(prep_info_dict),
            # magic #1 = testing study
            'study': 1,
            'data_type': 'Metagenomic'}
    pid = self.qclient.post('/apitest/prep_template/', data=data)['prep']

    # inserting artifacts: two samples, each with forward + reverse reads
    fp1_1, fp1_2, fp2_1, fp2_2 = self._helper_shogun_bowtie()
    data = {
        'filepaths': dumps([
            (fp1_1, 'raw_forward_seqs'),
            (fp1_2, 'raw_reverse_seqs'),
            (fp2_1, 'raw_forward_seqs'),
            (fp2_2, 'raw_reverse_seqs')]),
        'type': "per_sample_FASTQ",
        'name': "Test Shogun artifact",
        'prep': pid}
    aid = self.qclient.post('/apitest/artifact/', data=data)['artifact']

    self.params['input'] = aid
    self.params['Aligner tool'] = 'utree'

    data = {'user': '******',
            'command': dumps(['qp-shogun', '012020', 'Shogun v1.0.7']),
            'status': 'running',
            'parameters': dumps(self.params)}
    jid = self.qclient.post('/apitest/processing_job/', data=data)['job']

    out_dir = mkdtemp()
    self._clean_up_files.append(out_dir)

    success, ainfo, msg = shogun(self.qclient, jid, self.params, out_dir)

    self.assertEqual("", msg)
    self.assertTrue(success)

    # we are expecting 4 artifacts in total: the alignment profile plus
    # one taxonomic prediction table per level (phylum, genus, species)
    pout_dir = partial(join, out_dir)
    self.assertCountEqual(ainfo, [
        ArtifactInfo('Shogun Alignment Profile', 'BIOM',
                     [(pout_dir('otu_table.alignment.profile.biom'),
                       'biom'),
                      (pout_dir('alignment.utree.tsv.xz'), 'log')]),
        ArtifactInfo('Taxonomic Predictions - phylum', 'BIOM',
                     [(pout_dir('otu_table.redist.phylum.biom'), 'biom')]),
        ArtifactInfo('Taxonomic Predictions - genus', 'BIOM',
                     [(pout_dir('otu_table.redist.genus.biom'), 'biom')]),
        ArtifactInfo('Taxonomic Predictions - species', 'BIOM',
                     [(pout_dir('otu_table.redist.species.biom'),
                       'biom')])])
def _per_sample_ainfo(out_dir, samples, fwd_and_rev=False): files = [] missing_files = [] if fwd_and_rev: suffixes = [ '%s_paired_1.fastq', '%s_paired_2.fastq', '%s_unmatched_1.fastq', '%s_unmatched_2.fastq' ] else: suffixes = ['%s.fastq'] for rp, _, _, _ in samples: smd = partial(join, out_dir, rp) for suff in suffixes: fname = smd(suff % rp) if exists(fname): files.append(fname) else: missing_files.append(fname) if not files: # KneadData did not create any files, which means that no sequence # was kept after quality control and filtering for host data raise ValueError("No sequences left after running KneadData") # Generate the missing files for f in missing_files: open(f, 'w', 0).close() files.append(f) # Gzip all the files files = [(_gzip_file(f), 'preprocessed_fastq') for f in files] return [ArtifactInfo('KneadData files', 'per_sample_FASTQ', files)]
def test_validate_multiple_single_lane(self):
    """A single-lane FASTQ artifact validates and gets gzipped outputs."""
    work_dir = mkdtemp()
    self._clean_up_files.append(work_dir)
    fwd_fp = f'{work_dir}/prefix1.fastq'
    bcd_fp = f'{work_dir}/prefix1_b.fastq'
    for dst in (fwd_fp, bcd_fp):
        copyfile(self.fastq, dst)

    template = {
        "1.SKB2.640194": {"not_a_run_prefix": "prefix1"},
        "1.SKM4.640180": {"not_a_run_prefix": "prefix1"},
        "1.SKB3.640195": {"not_a_run_prefix": "prefix2"},
    }
    file_map = {'raw_forward_seqs': [fwd_fp],
                'raw_barcodes': [bcd_fp]}
    artifact_type = "FASTQ"
    job_id, _ = self._create_template_and_job(template, file_map,
                                              artifact_type)

    success, ainfo, error = _validate_multiple(
        self.qclient, job_id, template, file_map, artifact_type)

    self.assertEqual(error, "")
    self.assertTrue(success)
    # the validator compresses the inputs in place
    expected = [ArtifactInfo(None, artifact_type,
                             [(f'{bcd_fp}.gz', 'raw_barcodes'),
                              (f'{fwd_fp}.gz', 'raw_forward_seqs')])]
    self.assertEqual(ainfo, expected)
def func(qclient, job_id, job_params, working_dir):
    """Create an empty test FASTQ and return it as a Demultiplexed artifact."""
    out_fp = join(working_dir, 'test.fastq')
    # touch the file (zero-length content)
    with open(out_fp, 'w') as fh:
        fh.write('')
    artifact = ArtifactInfo('out1', 'Demultiplexed',
                            [[out_fp, 'preprocessed_fastq']])
    return True, "", [artifact]
def test_validate_run_prefix(self):
    """validate() maps BIOM sample ids through the prep run_prefix column."""
    template = {
        'SKB8.640193': {'col': 'val1', 'run_prefix': 'Sample1'},
        'SKD8.640184': {'col': 'val2', 'run_prefix': 'Sample2'},
    }
    post_data = {'prep_info': dumps(template), 'study': 1,
                 'data_type': '16S'}
    res = self.qclient.post('/apitest/prep_template/', data=post_data)

    biom_fp, job_id, parameters = self._create_job_and_biom(
        ['Sample1', 'Sample2'], template=res['prep'])
    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)

    exp_biom_fp = join(self.out_dir, basename(biom_fp))
    exp_index_fp = join(self.out_dir, 'index.html')
    exp_viz_fp = join(self.out_dir, 'support_files')
    exp_qza_fp = join(self.out_dir, 'feature-table.qza')
    self._clean_up_files.append(exp_biom_fp)

    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [
        ArtifactInfo(None, 'BIOM', [(exp_biom_fp, 'biom'),
                                    (exp_index_fp, 'html_summary'),
                                    (exp_viz_fp, 'html_summary_dir'),
                                    (exp_qza_fp, 'qza')])])
    self.assertEqual(obs_error, "")

    # the rewritten table must carry the study-prefixed sample ids
    renamed = load_table(exp_biom_fp)
    self.assertCountEqual(renamed.ids(),
                          ["1.SKB8.640193", "1.SKD8.640184"])
def test_validate_FeatureData(self):
    """_validate_feature_data accepts a taxonomy table and rejects a file
    whose header does not look like a taxonomy header."""
    # Create the feature data: standard 3-column taxonomy file
    fd, fd_fp = mkstemp(suffix='.txt', dir=self.out_dir)
    close(fd)
    with open(fd_fp, 'w') as f:
        f.write("Feature ID\tTaxonomy\tConfidence\n")
        f.write("TACGGAGGA\tk__Bacteria;p__Bacteroidetes;c__Bacteroidia\t"
                "0.9998743\n")
        f.write("TACGTAGGG\tk__Bacteria;p__Firmicutes;c__Clostridia\t"
                "0.9999999\n")

    # Test success: file is returned unchanged as a FeatureData artifact
    obs_success, obs_ainfo, obs_error = _validate_feature_data(
        {'plain_text': [fd_fp]}, None, self.out_dir)
    self.assertEqual(obs_error, "")
    self.assertTrue(obs_success)
    exp_ainfo = [ArtifactInfo(None, "FeatureData",
                              [(fd_fp, 'plain_text')])]
    self.assertEqual(obs_ainfo, exp_ainfo)

    # Test failure wrong format: header lacks the expected column names
    fd, fd_fp = mkstemp(suffix='.txt', dir=self.out_dir)
    close(fd)
    with open(fd_fp, 'w') as f:
        f.write("Feature ID\tIt's gonna fail!\tConfidence\n")
        f.write("TACGGAGGA\tk__Bacteria;p__Bacteroidetes;c__Bacteroidia\t"
                "0.9998743\n")
        f.write("TACGTAGGG\tk__Bacteria;p__Firmicutes;c__Clostridia\t"
                "0.9999999\n")
    obs_success, obs_ainfo, obs_error = _validate_feature_data(
        {'plain_text': [fd_fp]}, None, self.out_dir)
    self.assertIn("The file header seems wrong", obs_error)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
def test_validate_multiple_single_lane(self):
    """Single-lane FASTQ validation passes files through unchanged."""
    template = {
        "1.SKB2.640194": {"not_a_run_prefix": "prefix1"},
        "1.SKM4.640180": {"not_a_run_prefix": "prefix1"},
        "1.SKB3.640195": {"not_a_run_prefix": "prefix2"},
    }
    file_map = {'raw_forward_seqs': ['/path/to/prefix1.fastq'],
                'raw_barcodes': ['/path/to/prefix1_b.fastq']}
    artifact_type = "FASTQ"
    job_id = self._create_template_and_job(template, file_map,
                                           artifact_type)

    success, ainfo, error = _validate_multiple(
        self.qclient, job_id, template, file_map, artifact_type)

    self.assertTrue(success)
    expected = [ArtifactInfo(None, artifact_type,
                             [('/path/to/prefix1_b.fastq', 'raw_barcodes'),
                              ('/path/to/prefix1.fastq',
                               'raw_forward_seqs')])]
    self.assertEqual(ainfo, expected)
    self.assertEqual(error, "")
def test_validate_distance_matrix(self):
    """_validate_distance_matrix only accepts ids known to the metadata."""
    # happy path: every matrix id is present in the metadata
    known_ids = ['1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
                 '1.SKM9.640192', '1.SKB7.640196']
    dm_fp = self._create_distance_matrix(known_ids)
    obs_success, obs_ainfo, obs_error = _validate_distance_matrix(
        {'plain_text': [dm_fp]}, self.metadata, self.out_dir)
    self.assertTrue(obs_success)
    self.assertEqual(
        obs_ainfo,
        [ArtifactInfo(None, "distance_matrix", [(dm_fp, 'plain_text')])])
    self.assertEqual(obs_error, "")

    # failure: one id is unknown to the metadata
    bad_ids = ['1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
               '1.SKM9.640192', 'NotASample']
    dm_fp = self._create_distance_matrix(bad_ids)
    obs_success, obs_ainfo, obs_error = _validate_distance_matrix(
        {'plain_text': [dm_fp]}, self.metadata, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    self.assertEqual(obs_error, "The distance matrix contain samples not "
                                "present in the metadata")
def test_validate_per_sample_FASTQ(self):
    """Per-sample FASTQ validation matches files by sample id in filename."""
    template = {
        "1.SKB2.640194": {"not_a_run_prefix": "prefix1"},
        "1.SKM4.640180": {"not_a_run_prefix": "prefix1"},
        "1.SKB3.640195": {"not_a_run_prefix": "prefix2"},
    }
    fwd = ['/path/to/SKB2.640194_file.fastq',
           '/path/to/SKM4.640180_file.fastq',
           '/path/to/SKB3.640195_file.fastq']
    file_map = {'raw_forward_seqs': fwd}
    job_id = self._create_template_and_job(template, file_map,
                                           "per_sample_FASTQ")

    success, ainfo, error = _validate_per_sample_FASTQ(
        self.qclient, job_id, template, file_map)

    self.assertTrue(success)
    expected = [ArtifactInfo(None, "per_sample_FASTQ",
                             [(fp, 'raw_forward_seqs') for fp in fwd])]
    self.assertEqual(ainfo, expected)
    self.assertEqual(error, "")
def test_validate_multiple(self):
    """Multi-lane FASTQ validation groups files by their run prefix."""
    template = {
        '1.SKB2.640194': {'run_prefix': 'prefix1'},
        '1.SKM4.640180': {'run_prefix': 'prefix1'},
        '1.SKB3.640195': {'run_prefix': 'prefix2'},
    }
    file_map = {
        'raw_forward_seqs': ['/path/to/prefix1.fastq',
                             '/path/to/prefix2.fastq'],
        'raw_barcodes': ['/path/to/prefix1_b.fastq',
                         '/path/to/prefix2_b.fastq'],
    }
    job_id = self._create_template_and_job(template, file_map, "FASTQ")

    success, ainfo, error = _validate_multiple(
        self.qclient, job_id, template, file_map, "FASTQ")

    self.assertTrue(success)
    expected_fps = [('/path/to/prefix1_b.fastq', 'raw_barcodes'),
                    ('/path/to/prefix2_b.fastq', 'raw_barcodes'),
                    ('/path/to/prefix1.fastq', 'raw_forward_seqs'),
                    ('/path/to/prefix2.fastq', 'raw_forward_seqs')]
    self.assertEqual(ainfo, [ArtifactInfo(None, "FASTQ", expected_fps)])
    self.assertEqual(error, "")
def _per_sample_ainfo(out_dir, samples, suffixes, prg_name, files_type_name, fwd_and_rev=False): files = [] missing_files = [] smd = partial(join, out_dir) for rp, _, _, _ in samples: for suff in suffixes: fname = smd(suff % rp) if exists(fname): if fname.endswith('R1.fastq.gz'): ftype = 'raw_forward_seqs' elif fname.endswith('R2.fastq.gz'): ftype = 'raw_reverse_seqs' else: # this should never happen and it's not really possible # to reproduce so no tests! raise ValueError('File %s has an unexpected name' % fname) files.append((fname, ftype)) else: missing_files.append(fname) if not files: # Command did not create any files, which means that no sequence # was kept after quality control and filtering for host data raise ValueError("No sequences left after %s" % prg_name) return [ArtifactInfo(files_type_name, 'per_sample_FASTQ', files)]
def test_validate_prefix(self):
    """Sample ids get the study prefix ('1.') when no run_prefix is used."""
    template = {
        'SKB8.640193': {'col': 'val1'},
        'SKD8.640184': {'col': 'val2'},
    }
    res = self.qclient.post(
        '/apitest/prep_template/',
        data={'prep_info': dumps(template), 'study': 1,
              'data_type': '16S'})
    biom_fp, job_id, parameters = self._create_job_and_biom(
        ['SKB8.640193', 'SKD8.640184'], template=res['prep'])

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)

    expected_biom = join(self.out_dir, basename(biom_fp))
    self._clean_up_files.append(expected_biom)
    self.assertTrue(obs_success)
    self.assertEqual(
        obs_ainfo,
        [ArtifactInfo(None, 'BIOM', [(expected_biom, 'biom')])])
    self.assertEqual(obs_error, "")

    # the rewritten table carries the prefixed ids
    table = load_table(expected_biom)
    self.assertCountEqual(table.ids(),
                          ['1.SKB8.640193', '1.SKD8.640184'])
def _validate_alpha_vector(files, metadata, out_dir): # Magic number [0] -> there is only one plain text file, which is the # ordination results alpha_vector = files['plain_text'][0] alpha_qza = None if 'qza' in files: alpha_qza = files['qza'][0] # Parse the sample ids from the alphe_vector file alpha_ids = [] with open(alpha_vector) as f: # Ignore the header line f.readline() for line in f: vals = line.strip().split('\t') if len(vals) != 2: return (False, None, "The alpha vector format is incorrect") alpha_ids.append(vals[0]) metadata_ids = set(metadata) alpha_ids = set(alpha_ids) if not metadata_ids.issuperset(alpha_ids): return (False, None, "The alpha vector contains samples not present " "in the metadata") filepaths = [(alpha_vector, 'plain_text')] if alpha_qza is not None: filepaths.append((alpha_qza, 'qza')) return True, [ArtifactInfo(None, 'alpha_vector', filepaths)], ""
def test_validate_ordination_results(self):
    """_validate_ordination_results only accepts ids known to the metadata."""
    # happy path: all ordination samples exist in the metadata
    known_ids = ['1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
                 '1.SKM9.640192', '1.SKB7.640196']
    ord_res_fp = self._create_ordination_results(known_ids)
    obs_success, obs_ainfo, obs_error = _validate_ordination_results(
        {'plain_text': [ord_res_fp]}, self.metadata, self.out_dir)
    self.assertTrue(obs_success)
    self.assertEqual(
        obs_ainfo,
        [ArtifactInfo(None, "ordination_results",
                      [(ord_res_fp, 'plain_text')])])
    self.assertEqual(obs_error, "")

    # failure: one sample is unknown to the metadata
    bad_ids = ['1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
               '1.SKM9.640192', 'NotASample']
    ord_res_fp = self._create_ordination_results(bad_ids)
    obs_success, obs_ainfo, obs_error = _validate_ordination_results(
        {'plain_text': [ord_res_fp]}, self.metadata, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    self.assertEqual(obs_error, "The ordination results contain samples "
                                "not present in the metadata")
def test_validate_no_changes(self):
    """validate() keeps the BIOM untouched when ids match template 1 and
    attaches the summary/QZA support files."""
    sample_ids = [
        '1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
        '1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
        '1.SKM2.640199', '1.SKD2.640178', '1.SKB7.640196', '1.SKD4.640185',
        '1.SKB8.640193', '1.SKM3.640197', '1.SKD5.640186', '1.SKB1.640202',
        '1.SKM1.640183', '1.SKD1.640179', '1.SKD3.640198', '1.SKB5.640181',
        '1.SKB4.640189', '1.SKB9.640200', '1.SKM9.640192', '1.SKD8.640184',
        '1.SKM5.640177', '1.SKM7.640188', '1.SKD7.640191'
    ]
    biom_fp, job_id, parameters = self._create_job_and_biom(
        sample_ids, template=1)

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)

    in_out = partial(join, self.out_dir)
    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [
        ArtifactInfo(None, 'BIOM',
                     [(biom_fp, 'biom'),
                      (in_out('index.html'), 'html_summary'),
                      (in_out('support_files'), 'html_summary_dir'),
                      (in_out('feature-table.qza'), 'qza')])])
    self.assertEqual(obs_error, "")
def test_validate_demux_file(self):
    """_validate_demux_file generates fastq/fasta companions for the demux
    file and keeps only the samples present in the demux (s1, s2)."""
    demux_fp, _, out_dir = self._generate_files({'s1': 's1', 's2': 's2'})
    # prep lists four run prefixes, but only s1/s2 exist in the demux file
    prep_info = {
        "1.SKB2.640194": {"run_prefix": "s1"},
        "1.SKM4.640180": {"run_prefix": "s2"},
        "1.SKB3.640195": {"run_prefix": "s3"},
        "1.SKB6.640176": {"run_prefix": "s4"}
    }
    files = {'preprocessed_demux': demux_fp}
    job_id, _ = self._create_template_and_job(prep_info, files,
                                              "Demultiplexed")
    obs_success, obs_ainfo, obs_error = _validate_demux_file(
        self.qclient, job_id, prep_info, out_dir, demux_fp)
    self.assertTrue(obs_success)
    # the validator writes gzipped fastq/fasta next to the demux file
    name = splitext(basename(demux_fp))[0]
    exp_fastq_fp = join(out_dir, "%s.fastq.gz" % name)
    exp_fasta_fp = join(out_dir, "%s.fasta.gz" % name)
    exp_demux_fp = join(out_dir, basename(demux_fp))
    filepaths = [
        (exp_fastq_fp, 'preprocessed_fastq'),
        (exp_fasta_fp, 'preprocessed_fasta'),
        (exp_demux_fp, 'preprocessed_demux')]
    exp = [ArtifactInfo(None, "Demultiplexed", filepaths)]
    self.assertEqual(obs_ainfo, exp)
    self.assertEqual(obs_error, "")
    # the demux HDF5 keys are the prefixed sample ids that had data
    with File(exp_demux_fp) as f:
        self.assertCountEqual(f.keys(),
                              ["1.SKB2.640194", "1.SKM4.640180"])
def test_validate_per_sample_FASTQ_run_prefix(self):
    """Per-sample FASTQ validation with run_prefix columns gzips the raw
    files and returns them as a per_sample_FASTQ artifact."""
    # stage three real fastq copies named after the sample ids
    f1 = join(self.source_dir, 'SKB2.640194_file.fastq')
    f2 = join(self.source_dir, 'SKM4.640180_file.fastq')
    f3 = join(self.source_dir, 'SKB3.640195_file.fastq')
    raw_files = [f1, f2, f3]
    for x in raw_files:
        copyfile(self.fastq, x)
        self._clean_up_files.append(x)

    prep_info = {
        "1.SKB2.640194": {"run_prefix": "prefix1"},
        "1.SKM4.640180": {"run_prefix": "prefix2"},
        "1.SKB3.640195": {"run_prefix": "prefix3"}
    }
    files = {'raw_forward_seqs': raw_files}
    job_id, _ = self._create_template_and_job(prep_info, files,
                                              "per_sample_FASTQ")
    obs_success, obs_ainfo, obs_error = _validate_per_sample_FASTQ(
        self.qclient, job_id, prep_info, files)
    self.assertEqual(obs_error, "")
    self.assertTrue(obs_success)
    # the validator compresses each input in place -> '.gz' filepaths
    filepaths = [('%s.gz' % x, 'raw_forward_seqs') for x in raw_files]
    exp = [ArtifactInfo(None, "per_sample_FASTQ", filepaths)]
    self.assertEqual(obs_ainfo, exp)
def spades(qclient, job_id, parameters, out_dir):
    """Run spades with the given parameters

    Parameters
    ----------
    qclient : tgp.qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run split libraries
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list, str
        The results of the job
    """
    # stdlib rename used below; imported locally to avoid touching the
    # module import block
    from os import rename

    msg = "Step 3 of 4: Checking resulting files"
    qclient.update_job_step(job_id, msg)

    artifact_id = parameters['input']
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)

    # Get the artifact metadata
    prep_info = qclient.get('/qiita_db/prep_template/%s/'
                            % artifact_info['prep_information'][0])
    df = pd.read_csv(prep_info['prep-file'], sep='\t', dtype='str',
                     na_values=[], keep_default_na=True)
    snames = df.sample_name.values

    missing = []
    outfiles = []
    for sname in snames:
        scaffold = join(out_dir, sname, 'scaffolds.fasta')
        if exists(scaffold):
            new_scaffold = join(out_dir, sname, f'{sname}.fasta')
            # rename within the same directory; replaces the previous
            # run(['mv', ...]) which spawned one subprocess per sample and
            # was not portable
            rename(scaffold, new_scaffold)
            outfiles.append((new_scaffold, 'preprocessed_fasta'))
        else:
            missing.append(sname)
    if missing:
        error_msg = ('There was no scaffolds.fasta for samples: %s. Contact: '
                     '[email protected] and add this job id: %s'
                     % (', '.join(missing), job_id))
        return False, None, error_msg

    # Step 4 generating artifacts
    msg = "Step 4 of 4: Generating new artifact"
    qclient.update_job_step(job_id, msg)
    ainfo = [
        ArtifactInfo('Preprocessed FASTA', 'FASTA_preprocessed', outfiles)
    ]

    return True, ainfo, ""
def test_validate_representative_set(self):
    """BIOM validation against a representative-set FASTA: accepts an exact
    observation match, rejects extra or missing observation ids."""
    sample_ids = [
        '1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
        '1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
        '1.SKM2.640199'
    ]
    biom_fp, job_id, parameters = self._create_job_and_biom(sample_ids,
                                                            template=1)
    # FASTA with exactly the two observations (O1, O2) in the test BIOM
    fd, fasta_fp = mkstemp(suffix=".fna")
    close(fd)
    with open(fasta_fp, 'w') as f:
        f.write(">O1 something\nACTG\n>O2\nATGC\n")
    self._clean_up_files.append(fasta_fp)
    parameters = {
        'template': parameters['template'],
        'files': dumps({
            'biom': [biom_fp],
            'preprocessed_fasta': [fasta_fp]
        }),
        'artifact_type': 'BIOM'
    }
    obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                 parameters, self.out_dir)
    self.assertTrue(obs_success)
    files = [(biom_fp, 'biom'), (fasta_fp, 'preprocessed_fasta')]
    self.assertEqual(obs_ainfo, [ArtifactInfo(None, 'BIOM', files)])
    self.assertEqual(obs_error, "")

    # Extra ids: FASTA has O3, which the BIOM table does not
    with open(fasta_fp, 'w') as f:
        f.write(">O1 something\nACTG\n>O2\nATGC\n>O3\nATGC\n")
    obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                 parameters, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    self.assertEqual(
        obs_error,
        "The representative set sequence file includes observations not "
        "found in the BIOM table: O3")

    # Missing ids: FASTA lacks O2, which the BIOM table contains
    with open(fasta_fp, 'w') as f:
        f.write(">O1 something\nACTG\n")
    obs_success, obs_ainfo, obs_error = validate(self.qclient, job_id,
                                                 parameters, self.out_dir)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    # "tabe" (sic) matches the message emitted by validate — do not "fix"
    # the expected string here without fixing the production message too
    self.assertEqual(
        obs_error,
        "The representative set sequence file is missing observation ids "
        "found in the BIOM tabe: O2")
def test_validate_no_changes_superset(self):
    """A BIOM whose samples are a subset of the template passes untouched."""
    subset_ids = [
        '1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
        '1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
        '1.SKM2.640199'
    ]
    biom_fp, job_id, parameters = self._create_job_and_biom(
        subset_ids, template=1)

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)

    self.assertTrue(obs_success)
    self.assertEqual(
        obs_ainfo, [ArtifactInfo(None, 'BIOM', [(biom_fp, 'biom')])])
    self.assertEqual(obs_error, "")
def _validate_ordination_results(files, metadata, out_dir):
    """Validate an ordination_results artifact.

    The single plain-text file must parse as scikit-bio OrdinationResults
    and every sample id in it must appear in the metadata.
    """
    # Magic number [0] -> there is only one plain text file: the
    # ordination results
    ord_fp = files['plain_text'][0]
    results = OrdinationResults.read(ord_fp)

    # every ordination sample must be known to the metadata
    if not set(metadata).issuperset(results.samples.index):
        return (False, None, "The ordination results contain samples not "
                             "present in the metadata")

    return (True,
            [ArtifactInfo(None, 'ordination_results',
                          [(ord_fp, 'plain_text')])],
            "")
def _validate_distance_matrix(files, metadata, out_dir):
    """Validates a distance matrix artifact.

    The single plain-text file must parse as a scikit-bio DistanceMatrix
    and every id in it must appear in the metadata.
    """
    # Magic number [0] -> there is only one plain text file which is
    # the distance matrix
    dm_fp = files['plain_text'][0]
    matrix = DistanceMatrix.read(dm_fp)

    # every matrix id must be known to the metadata
    if not set(metadata).issuperset(matrix.ids):
        return (False, None, "The distance matrix contain samples not "
                             "present in the metadata")

    return (True,
            [ArtifactInfo(None, 'distance_matrix', [(dm_fp, 'plain_text')])],
            "")
def test_validate_q2_visualization(self):
    """q2 visualization validation: loadable and unloadable qzv inputs."""
    # a loadable qzv yields the qzv + summary html + support dir trio
    obs_success, obs_ainfo, obs_error = _validate_q2_visualization(
        {'qzv': [self.valid_qzv]}, self.out_dir)
    self.assertEqual(obs_error, "")
    self.assertTrue(obs_success)
    expected = [ArtifactInfo(None, 'q2_visualization',
                             [(self.valid_qzv, 'qzv'),
                              (join(self.out_dir, 'index.html'),
                               'html_summary'),
                              (join(self.out_dir, 'support_files'),
                               'html_summary_dir')])]
    self.assertEqual(obs_ainfo, expected)

    # an unloadable qzv is rejected with a loading error
    obs_success, obs_ainfo, obs_error = _validate_q2_visualization(
        {'qzv': [self.invalid_qzv]}, self.out_dir)
    self.assertIn("Error loading Qiime 2 visualization:", obs_error)
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
def _validate_feature_data(files, metadata, out_dir): # Magic number [0] -> there is only one plain text file, which is the # ordination results fdt = files['plain_text'][0] fdt_qza = None if 'qza' in files: fdt_qza = files['qza'][0] # basic header check to verify that it looks like a taxonomy file with open(fdt) as f: line = f.readline() if 'Tax' not in line or 'ID' not in line: return (False, None, 'The file header seems wrong "%s"' % line) filepaths = [(fdt, 'plain_text')] if fdt_qza is not None: filepaths.append((fdt_qza, 'qza')) return True, [ArtifactInfo(None, 'FeatureData', filepaths)], ""
def test_validate_no_changes(self):
    """validate() returns the BIOM untouched when its ids already match
    template 1."""
    all_ids = [
        '1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
        '1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
        '1.SKM2.640199', '1.SKD2.640178', '1.SKB7.640196', '1.SKD4.640185',
        '1.SKB8.640193', '1.SKM3.640197', '1.SKD5.640186', '1.SKB1.640202',
        '1.SKM1.640183', '1.SKD1.640179', '1.SKD3.640198', '1.SKB5.640181',
        '1.SKB4.640189', '1.SKB9.640200', '1.SKM9.640192', '1.SKD8.640184',
        '1.SKM5.640177', '1.SKM7.640188', '1.SKD7.640191'
    ]
    biom_fp, job_id, parameters = self._create_job_and_biom(
        all_ids, template=1)

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)

    self.assertTrue(obs_success)
    self.assertEqual(
        obs_ainfo, [ArtifactInfo(None, 'BIOM', [(biom_fp, 'biom')])])
    self.assertEqual(obs_error, "")
def test_validate_analysis(self):
    """Analysis-sourced BIOMs validate with summary files attached."""
    ids = ['1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184',
           '1.SKM9.640192', '1.SKB7.640196']
    biom_fp, job_id, parameters = self._create_job_and_biom(ids, analysis=1)

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)

    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [
        ArtifactInfo(None, 'BIOM',
                     [(biom_fp, 'biom'),
                      (join(self.out_dir, 'index.html'), 'html_summary'),
                      (join(self.out_dir, 'support_files'),
                       'html_summary_dir')])])
    self.assertEqual(obs_error, "")
def test_validate_alpha_vector(self):
    """_validate_alpha_vector accepts a well-formed vector whose ids are in
    the metadata and rejects unknown ids or a malformed file."""
    # Create the alpha vector
    sample_ids = [
        '1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184', '1.SKM9.640192'
    ]
    alpha_vector_fp = self._create_alpha_vector(sample_ids)
    # Test success
    obs_success, obs_ainfo, obs_error = _validate_alpha_vector(
        {'plain_text': [alpha_vector_fp]}, self.metadata, self.out_dir)
    self.assertEqual(obs_error, "")
    self.assertTrue(obs_success)
    exp_ainfo = [ArtifactInfo(None, "alpha_vector",
                              [(alpha_vector_fp, 'plain_text')])]
    self.assertEqual(obs_ainfo, exp_ainfo)
    # Test failure wrong ids: 'NotASample' is not in the metadata
    sample_ids = [
        '1.SKM4.640180', '1.SKB8.640193', '1.SKD8.640184', 'NotASample'
    ]
    alpha_vector_fp = self._create_alpha_vector(sample_ids)
    obs_success, obs_ainfo, obs_error = _validate_alpha_vector(
        {'plain_text': [alpha_vector_fp]}, self.metadata, self.out_dir)
    self.assertEqual(obs_error, "The alpha vector contains samples not "
                                "present in the metadata")
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    # Test failure wrong format: data row has no tab-separated value
    fd, alpha_vector_fp = mkstemp(suffix='.txt', dir=self.out_dir)
    close(fd)
    with open(alpha_vector_fp, 'w') as f:
        f.write("\tobserved_otus\nsample 1\n")
    obs_success, obs_ainfo, obs_error = _validate_alpha_vector(
        {'plain_text': [alpha_vector_fp]}, self.metadata, self.out_dir)
    self.assertEqual(obs_error, "The alpha vector format is incorrect")
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
def test_validate_no_changes_superset(self):
    """A BIOM with a subset of the template samples validates and gets the
    summary support files attached."""
    subset_ids = [
        '1.SKB2.640194', '1.SKM4.640180', '1.SKB3.640195', '1.SKB6.640176',
        '1.SKD6.640190', '1.SKM6.640187', '1.SKD9.640182', '1.SKM8.640201',
        '1.SKM2.640199'
    ]
    biom_fp, job_id, parameters = self._create_job_and_biom(
        subset_ids, template=1)

    obs_success, obs_ainfo, obs_error = validate(
        self.qclient, job_id, parameters, self.out_dir)

    self.assertTrue(obs_success)
    self.assertEqual(obs_ainfo, [
        ArtifactInfo(None, 'BIOM',
                     [(biom_fp, 'biom'),
                      (join(self.out_dir, 'index.html'), 'html_summary'),
                      (join(self.out_dir, 'support_files'),
                       'html_summary_dir')])])
    self.assertEqual(obs_error, "")
def _validate_q2_visualization(files, out_dir): # Magic number 0 -> there is only 1 qzv file qzv_fp = files['qzv'][0] # If the loader files this is not a correct Qiime 2 visualization. There # is no common exception raised, so we catch all of them try: q2vis = Visualization.load(qzv_fp) except Exception as e: return False, None, "Error loading Qiime 2 visualization: %s" % e # The visualization in Qiime 2 can contain multiple files and directories. # Adding all of them to Qiita can generate a very polluted GUI. In order # to improve that, we create a directory where all the files are added # (including the original index file) and we create a new index file with # an iframe that points to the old index file. This way, we are only adding # one HTML file and one directory to the Qiita filepaths. html_dir = join(out_dir, 'support_files') html_fp = join(out_dir, 'index.html') # Extract all the visualization files in the support_files. q2vis.export_data(html_dir) # Find the index paths. Only HTML based visualizations are currently # supported, since everything that we are doing is web based. Not sure # if there is any other type of visualizaion in Qiime 2 at this point, but # checking here will show a useful error in case that this occurs. index_paths = q2vis.get_index_paths() if 'html' not in index_paths: return (False, None, "Only Qiime 2 visualization with an html index are supported") index_name = basename(index_paths['html']) with open(html_fp, 'w') as f: f.write(Q2_INDEX % index_name) # We add the original qzv file so users can download it and play with it filepaths = [(qzv_fp, 'qzv'), (html_fp, 'html_summary'), (html_dir, 'html_summary_dir')] return True, [ArtifactInfo(None, 'q2_visualization', filepaths)], ""
def test_validate_SFF(self):
    """SFF validation: files matching the prep run prefixes pass; a prefix
    with no matching file produces an error."""
    prep_info = {
        "1.SKB2.640194": {"run_prefix": "GAX40"},
        "1.SKM4.640180": {"run_prefix": "GAX40"},
        "1.SKB3.640195": {"run_prefix": "GAX50"}
    }
    # two files share the GAX40 prefix, one matches GAX50
    files = {
        'raw_sff': [
            '/path/to/GAX401.sff', '/path/to/GAX402.sff',
            '/path/to/GAX501.sff'
        ]
    }
    job_id, _ = self._create_template_and_job(prep_info, files, "SFF")
    obs_success, obs_ainfo, obs_error = _validate_multiple(
        self.qclient, job_id, prep_info, files, 'SFF')
    self.assertTrue(obs_success)
    filepaths = [('/path/to/GAX401.sff', 'raw_sff'),
                 ('/path/to/GAX402.sff', 'raw_sff'),
                 ('/path/to/GAX501.sff', 'raw_sff')]
    exp = [ArtifactInfo(None, "SFF", filepaths)]
    self.assertEqual(obs_ainfo, exp)
    self.assertEqual(obs_error, "")
    # let's test a failure: no file matches the GAX50 prefix
    files = {'raw_sff': ['/path/to/GAX401.sff', '/path/to/GAX402.sff']}
    job_id, _ = self._create_template_and_job(prep_info, files, "SFF")
    obs_success, obs_ainfo, obs_error = _validate_multiple(
        self.qclient, job_id, prep_info, files, 'SFF')
    error = ("Error creating artifact. Offending files:\nraw_sff: The "
             "following run prefixes in the prep information file do not "
             "match any file: GAX50")
    self.assertFalse(obs_success)
    self.assertIsNone(obs_ainfo)
    # NOTE(review): assertCountEqual on two strings compares character
    # multisets, not string equality — assertEqual is probably intended
    # here; confirm before changing
    self.assertCountEqual(obs_error, error)