def test_standalone_rpca(self):
    """Run DEICODE's standalone ``rpca`` command and compare its outputs.

    This is an integration-style check rather than a unit test: the
    details of the algorithm behind the standalone RPCA script are
    covered in deicode/tests/test_optspace.py, etc.
    """
    biom_fp = get_data_path('test.biom')
    # The output directory is the directory that holds the input table.
    data_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
    cli_args = ['--in-biom', biom_fp, '--output-dir', data_dir]
    result = CliRunner().invoke(sdc.commands['rpca'], cli_args)

    def load_distances(file_name):
        # Distance matrices are tab-separated with ids in the first column.
        return pd.read_csv(get_data_path(file_name), sep='\t', index_col=0)

    # Observed results written by the command
    observed_dist = load_distances('distance-matrix.tsv')
    observed_ord = OrdinationResults.read(get_data_path('ordination.txt'))

    # Expected results shipped with the test data
    expected_dist = load_distances('expected-distance-matrix.tsv')
    expected_ord = OrdinationResults.read(
        get_data_path('expected-ordination.txt'))

    # The distance matrix must match the expectation
    assert_array_almost_equal(observed_dist.values, expected_dist.values)

    # The ordination must match too -- every value, for both features
    # and samples
    assert_deicode_ordinationresults_equal(observed_ord, expected_ord)

    # Finally, an exit code of 0 indicates the command succeeded
    self.assertEqual(result.exit_code, 0)
def test_from_seralized_results(self):
    """Check pcoa_biplot on ordination results read back from disk.

    The current implementation of ordination results loses some
    information when serialized; pcoa_biplot should work regardless.
    """
    from_disk = OrdinationResults.read(get_data_path('PCoA_skbio'))

    biplot_from_disk = pcoa_biplot(from_disk, self.descriptors)
    biplot_in_memory = pcoa_biplot(self.ordination, self.descriptors)

    # Ignore the attributes that do not survive serialization.
    tolerances = dict(ignore_directionality=True,
                      ignore_axis_labels=True,
                      ignore_method_names=True)
    assert_ordination_results_equal(biplot_from_disk, biplot_in_memory,
                                    **tolerances)
def test_standalone_rpca_rank_est(self):
    """Run the standalone ``auto-rpca`` command and compare its outputs.

    Checks that the standalone rank estimate is used instead of an
    explicit rank setting.
    """
    biom_fp = get_data_path('test.biom')
    # The output directory is the directory that holds the input table.
    data_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
    cli_args = ['--in-biom', biom_fp, '--output-dir', data_dir]
    result = CliRunner().invoke(sdc.commands['auto-rpca'], cli_args)

    def load_distances(file_name):
        # Distance matrices are tab-separated with ids in the first column.
        return pd.read_csv(get_data_path(file_name), sep='\t', index_col=0)

    # Observed results written by the command
    observed_dist = load_distances('distance-matrix.tsv')
    observed_ord = OrdinationResults.read(get_data_path('ordination.txt'))

    # Expected results shipped with the test data
    expected_dist = load_distances('expected-est-distance-matrix.tsv')
    expected_ord = OrdinationResults.read(
        get_data_path('expected-est-ordination.txt'))

    # The distance matrix must match the expectation
    assert_array_almost_equal(observed_dist.values, expected_dist.values)

    # The ordination must match too -- every value, for both features
    # and samples
    assert_deicode_ordinationresults_equal(observed_ord, expected_ord)

    # Finally, an exit code of 0 indicates success. On failure, re-raise
    # with the traceback of the exception captured by the runner so the
    # real cause is visible in the test report.
    try:
        self.assertEqual(0, result.exit_code)
    except AssertionError:
        raise Exception(
            'Command failed with non-zero exit code'
        ).with_traceback(result.exception.__traceback__)
def _generate_ordination_results_summary(files, metadata, out_dir):
    """Write a standalone Emperor HTML summary for ordination results.

    Parameters
    ----------
    files : dict
        Must contain a 'plain_text' entry; its first element is read as
        the ordination results file.
    metadata : dict
        Converted to a DataFrame with ``orient='index'`` and handed to
        Emperor.
    out_dir : str
        Directory that receives ``index.html`` and the
        ``emperor_support_files`` sub-directory.

    Returns
    -------
    tuple
        ``(html_summary_fp, support_files_dp)``.
    """
    # There is only one plain text file and it is the ordination results,
    # hence the [0].
    ordination_fp = files['plain_text'][0]
    ordination = OrdinationResults.read(ordination_fp)
    sample_md = pd.DataFrame.from_dict(metadata, orient='index')
    viewer = Emperor(ordination, sample_md, remote="emperor_support_files")

    index_fp = join(out_dir, 'index.html')
    support_dp = join(out_dir, 'emperor_support_files')
    makedirs(support_dp)

    with open(index_fp, 'w') as handle:
        handle.write(viewer.make_emperor(standalone=True))
        viewer.copy_support_files(support_dp)

    return index_fp, support_dp
def test_standalone_rpca_rank_est(self):
    """Run the standalone ``auto-rpca`` command and compare its outputs.

    Checks that the standalone RPCA rank estimate is used instead of an
    explicit rank setting.
    """

    def rpca_path(file_name):
        # Every file used by this test lives in the 'rpca_data' subfolder.
        return get_data_path(file_name, subfolder='rpca_data')

    def load_distances(file_name):
        # Distance matrices are tab-separated with ids in the first column.
        return pd.read_csv(rpca_path(file_name), sep='\t', index_col=0)

    biom_fp = rpca_path('test.biom')
    # The output directory is the directory that holds the input table.
    data_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])
    cli_args = ['--in-biom', biom_fp, '--output-dir', data_dir]
    result = CliRunner().invoke(sdc.commands['auto-rpca'], cli_args)

    # Observed results written by the command
    observed_dist = load_distances('distance-matrix.tsv')
    observed_ord = OrdinationResults.read(rpca_path('ordination.txt'))

    # Expected results shipped with the test data
    expected_dist = load_distances('expected-est-distance-matrix.tsv')
    expected_ord = OrdinationResults.read(
        rpca_path('expected-est-ordination.txt'))

    # The distance matrix must match the expectation
    assert_array_almost_equal(observed_dist.values, expected_dist.values)

    # The ordination must match too -- every value, for both features
    # and samples
    assert_ordinationresults_equal(observed_ord, expected_ord)

    # Finally, gemelli's exit code must be 0 (indicating success)
    CliTestCase().assertExitCode(0, result)
def test_standalone_rpca_n_components(self):
    """Tests the standalone script when n_components is 2.

    Runs the standalone RPCA command with ``--n_components 2`` and
    ``--max_iterations 5``, requires a zero exit code, and then verifies
    that the ordination written to disk reports three axes.
    """
    in_ = get_data_path('test.biom')
    # The output directory is the directory that holds the input table.
    out_ = os_path_sep.join(in_.split(os_path_sep)[:-1])
    runner = CliRunner()
    # run the same command but with rank==2
    result = runner.invoke(standalone_rpca, [
        '--in-biom', in_, '--output-dir', out_,
        '--n_components', 2, '--max_iterations', 5
    ])
    self.assertEqual(result.exit_code, 0)
    ord_res = OrdinationResults.read(get_data_path('ordination.txt'))
    # Check it contains three axes. This used to be an ``if ...: pass``
    # that could never fail, so the axis count was not actually tested.
    # NOTE(review): presumably a third axis is added when n_components
    # is 2 -- confirm against the RPCA implementation.
    self.assertEqual(len(ord_res.proportion_explained), 3)
def _simulation_data(data, ids):
    """Build ordination results and a Euclidean distance table.

    The coordinates are written to a temporary ``ordination.txt`` file in
    the current working directory, parsed back with
    ``OrdinationResults.read`` and the file is then removed.

    Parameters
    ----------
    data : sequence
        Indexed as ``data[j][axis][i]``; axes 0, 1 and 2 are read for
        every entry ``j`` and every position ``i`` of ``data[j][0]``.
        Must be non-empty because ``data[0][0]`` sizes the 'Site' header.
    ids : sequence
        ``ids[j]`` names ``data[j]``. The mapping rows built at the end
        concatenate each id with a string, so ids are expected to be
        ``str``.

    Returns
    -------
    tuple
        ``(ordination_results, distance_matrix)`` where
        ``distance_matrix`` is a list of rows: a header row
        ``["", key, ...]`` followed by one ``[key, dist, ...]`` row per
        point, with the distances rendered as strings.
    """
    ordination_fp = "ordination.txt"
    n_positions = len(data[0][0])

    coords = {}
    with open(ordination_fp, "w", encoding='utf8') as ordination:
        ordination.write('Eigvals\t0\n\n')
        ordination.write('Proportion explained\t0\n\n')
        ordination.write('Species\t0\t0\n\n')
        ordination.write(
            'Site\t' + str(len(data) * n_positions) + '\t3\n')
        for j, row in enumerate(data):
            identifier = str(ids[j])
            for i in range(len(row[0])):
                point = [row[0][i], row[1][i], row[2][i]]
                fields = [identifier + "_t" + str(i)]
                fields.extend(str(value) for value in point)
                ordination.write("\t".join(fields) + "\n")
                # NOTE(review): these keys use "<id>.<i>" whereas the
                # ordination file uses "<id>_t<i>" -- confirm that the
                # mismatch is intended.
                coords[identifier + "." + str(i)] = point
        ordination.write("\n")
        ordination.write("Biplot\t0\t0\n\n")
        ordination.write("Site constraints\t0\t0\n")

    # Parse only after the ``with`` block has closed (and therefore
    # flushed) the file, and always clean the temporary file up.
    try:
        ordination_results = OrdinationResults.read(ordination_fp)
    finally:
        os.remove(ordination_fp)

    # Distance matrix (euclidean)
    keys = list(coords)
    distance_matrix = [[""] + keys]
    for key in keys:
        distances = [str(distance.euclidean(coords[key], coords[other]))
                     for other in keys]
        distance_matrix.append([key] + distances)

    # Mapping file
    # TODO: add functionality to return the mapping file; it is built
    # here but not returned yet.
    metadata = [
        ["#SampleID", "Subject", "Treatment", "Timepoint"],
        ["#q2:types", "categorical", "categorical", "numeric"],
    ]
    for subject in ids:
        for i in range(n_positions):
            # Treatment is the id without digits and without its last
            # remaining character.
            treatment = ''.join(
                [ch for ch in subject if not ch.isdigit()])[:-1]
            metadata.append([subject + "_t" + str(i), subject, treatment, i])

    return ordination_results, distance_matrix
def _validate_ordination_results(files, metadata, out_dir):
    """Validate that every ordination sample is described by the metadata.

    Parameters
    ----------
    files : dict
        Must contain a 'plain_text' entry; its first element is read as
        the ordination results file.
    metadata : iterable
        Iterated to collect the known sample ids.
    out_dir : str
        Not used by this function.

    Returns
    -------
    tuple
        ``(success, artifacts_info, error_message)``: on success a list
        with a single ``ArtifactInfo`` and an empty message; otherwise
        ``(False, None, message)``.
    """
    # There is only one plain text file, which is the ordination results,
    # hence the [0].
    ordination_fp = files['plain_text'][0]
    ordination = OrdinationResults.read(ordination_fp)

    # Sample ids present in the ordination vs. ids known to the metadata
    ordination_ids = set(ordination.samples.index)
    known_ids = set(metadata)

    if not ordination_ids <= known_ids:
        message = ("The ordination results contain samples not "
                   "present in the metadata")
        return False, None, message

    artifact = ArtifactInfo(None, 'ordination_results',
                            [(ordination_fp, 'plain_text')])
    return True, [artifact], ""
def test_standalone_rpca_n_components(self):
    """Tests the standalone RPCA script when n_components is 2.

    Runs the ``rpca`` command with ``--n_components 2`` and
    ``--max_iterations 5``, requires a zero exit code, and then verifies
    that the ordination written to disk reports three axes.
    """
    in_ = get_data_path('test.biom', subfolder='rpca_data')
    # The output directory is the directory that holds the input table.
    out_ = os_path_sep.join(in_.split(os_path_sep)[:-1])
    runner = CliRunner()
    # run the same command but with rank==2
    result = runner.invoke(sdc.commands['rpca'], [
        '--in-biom', in_, '--output-dir', out_,
        '--n_components', 2, '--max_iterations', 5
    ])
    CliTestCase().assertExitCode(0, result)
    ord_res = OrdinationResults.read(
        get_data_path('ordination.txt', subfolder='rpca_data'))
    # Check it contains three axes. This used to be an ``if ...: pass``
    # that could never fail, so the axis count was not actually tested.
    # NOTE(review): presumably a third axis is added when n_components
    # is 2 -- confirm against the RPCA implementation.
    n_axes = len(ord_res.proportion_explained)
    assert n_axes == 3, 'expected 3 axes, found %d' % n_axes
def setUp(self):
    """Load the PCoA fixture and build the matching mapping DataFrame.

    Sets ``self.ord_res`` (parsed from ``PCOA_STRING``) and ``self.mf``
    (sample metadata indexed by 'SampleID').
    """
    self.ord_res = OrdinationResults.read(StringIO(PCOA_STRING))

    columns = ['SampleID', 'Treatment', 'DOB', 'Description']
    rows = [
        ['PC.354', 'Control', '20061218', 'Ctrol_mouse_I.D._354'],
        ['PC.355', 'Control', '20061218', 'Control_mouse_I.D._355'],
        ['PC.356', 'Control', '20061126', 'Control_mouse_I.D._356'],
        ['PC.481', 'Control', '20070314', 'Control_mouse_I.D._481'],
        ['PC.593', 'Control', '20071210', 'Control_mouse_I.D._593'],
        ['PC.607', 'Fast', '20071112', 'Fasting_mouse_I.D._607'],
        ['PC.634', 'Fast', '20080116', 'Fasting_mouse_I.D._634'],
        ['PC.635', 'Fast', '20080116', 'Fasting_mouse_I.D._635'],
        ['PC.636', 'Fast', '20080116', 'Fasting_mouse_I.D._636'],
    ]

    mapping = pd.DataFrame(data=rows, columns=columns)
    mapping.set_index('SampleID', inplace=True)
    self.mf = mapping
def test_qiime2_rpca(self):
    """Tests that the Q2 and standalone RPCA results match.

    Only the distance matrices are compared at the end of the test.
    """
    out_subdir = "test_output"

    # Run DEICODE through QIIME 2 (specifically, the Artifact API) and
    # pull the underlying data out of the returned artifacts.
    ordination_qza, distmatrix_qza = q2deicode.actions.rpca(self.q2table)
    q2ordination = ordination_qza.view(OrdinationResults)
    q2distmatrix = distmatrix_qza.view(DistanceMatrix)

    # Next, run DEICODE outside of QIIME 2 so the two can be compared.
    # The contents of self.q2table first have to be written to a BIOM
    # file that the standalone command can read.
    self.q2table.export_data(get_data_path("", out_subdir))
    biom_fp = get_data_path('feature-table.biom', out_subdir)
    # Absolute output directory: the directory holding the exported
    # table (derived from a line in test_standalone_rpca()).
    out_dir = os_path_sep.join(biom_fp.split(os_path_sep)[:-1])

    # Run DEICODE outside of QIIME 2...
    # NOTE(review): the result of this invocation is not inspected, so a
    # failing command only shows up through the file reads below.
    CliRunner().invoke(standalone_rpca,
                       ['--in-biom', biom_fp, '--output-dir', out_dir])

    # ...and read in the resulting output files (derived from
    # test_standalone_rpca() elsewhere in DEICODE's codebase).
    # NOTE(review): q2ordination and stordination are loaded but never
    # compared with each other.
    stordination = OrdinationResults.read(
        get_data_path('ordination.txt', out_subdir))
    standalone_frame = read_csv(
        get_data_path('distance-matrix.tsv', out_subdir),
        sep='\t', index_col=0)
    stdistmatrix_values = standalone_frame.values

    # Convert the DistanceMatrix object to a numpy array so it can be
    # compared with the standalone values.
    q2distmatrix_values = q2distmatrix.to_data_frame().values

    # Finally: actually check the consistency of Q2 and standalone results!
    np.testing.assert_array_almost_equal(q2distmatrix_values,
                                         stdistmatrix_values)
def create_emperor_visual(args, pcfile):
    """Write a standalone Emperor plot for the coordinates in ``pcfile``.

    The plot (``index.html`` plus Emperor's support files) is written to
    ``<args.basedir>/<args.prefix>_pcoa3d``. An earlier layout, kept in
    the original as a commented-out line, used
    ``<args.basedir>/views/tmp/<args.prefix>_pcoa3d``.

    ``args`` must provide ``map_fp``, ``basedir`` and ``prefix``.

    Sample .pc file
    # Eigvals 4
    # 0.2705559825337763 0.07359266496720843 0.02997793703738496 0.0
    #
    # Proportion explained 4
    # 0.7231669539538659 0.19670525434062255 0.0801277917055116 0.0
    #
    # Species 0 0
    #
    # Site 4 4
    # ICM_LCY_Bv6--LCY_0001_2003_05_11 -0.04067063044757823 -0.09380781760926289 0.13680474645584195 0.0
    # ICM_LCY_Bv6--LCY_0003_2003_05_04 -0.11521436634022217 -0.15957409396683217 -0.10315005726535573 0.0
    # ICM_LCY_Bv6--LCY_0005_2003_05_16 0.4268532792747924 0.06657577342833808 -0.02212569426459717 0.0
    # ICM_LCY_Bv6--LCY_0007_2003_05_04 -0.2709682824869916 0.18680613814775715 -0.011528994925888972 0.0
    #
    # Biplot 0 0
    #
    # Site constraints 0 0
    """
    from emperor import Emperor
    from skbio import OrdinationResults

    # Load the metadata
    mapping = load_mf(args.map_fp)

    # The ordination must be read from a file (scikit-bio version 0.5.1
    # http://scikit-bio.org/docs/0.5.1/generated/generated/skbio.stats.ordination.OrdinationResults.html)
    ordination = OrdinationResults.read(pcfile)
    viewer = Emperor(ordination, mapping)

    out_dir = os.path.join(args.basedir, args.prefix + '_pcoa3d')
    print('OUT?', out_dir, args.basedir)
    os.makedirs(out_dir, mode=0o777, exist_ok=True)

    index_fp = os.path.join(out_dir, 'index.html')
    with open(index_fp, 'w') as handle:
        handle.write(viewer.make_emperor(standalone=True))
        viewer.copy_support_files(out_dir)
def create_emperor_visual(args, pcfile):
    """Write a standalone Emperor plot for the coordinates in ``pcfile``.

    The plot (``index.html`` plus Emperor's support files) is written to
    ``<args.basedir>/views/tmp/<args.prefix>_pcoa3d``.

    ``args`` must provide ``map_fp``, ``basedir`` and ``prefix``.

    Sample .pc file
    # Eigvals 4
    # 0.2705559825337763 0.07359266496720843 0.02997793703738496 0.0
    #
    # Proportion explained 4
    # 0.7231669539538659 0.19670525434062255 0.0801277917055116 0.0
    #
    # Species 0 0
    #
    # Site 4 4
    # ICM_LCY_Bv6--LCY_0001_2003_05_11 -0.04067063044757823 -0.09380781760926289 0.13680474645584195 0.0
    # ICM_LCY_Bv6--LCY_0003_2003_05_04 -0.11521436634022217 -0.15957409396683217 -0.10315005726535573 0.0
    # ICM_LCY_Bv6--LCY_0005_2003_05_16 0.4268532792747924 0.06657577342833808 -0.02212569426459717 0.0
    # ICM_LCY_Bv6--LCY_0007_2003_05_04 -0.2709682824869916 0.18680613814775715 -0.011528994925888972 0.0
    #
    # Biplot 0 0
    #
    # Site constraints 0 0
    """
    from emperor import Emperor
    from skbio import OrdinationResults

    # Load the metadata
    mapping = load_mf(args.map_fp)

    # The ordination must be read from a file (scikit-bio version 0.5.1
    # http://scikit-bio.org/docs/0.5.1/generated/generated/skbio.stats.ordination.OrdinationResults.html)
    ordination = OrdinationResults.read(pcfile)
    viewer = Emperor(ordination, mapping)

    out_dir = os.path.join(
        args.basedir, 'views', 'tmp', args.prefix + '_pcoa3d')
    print('OUT?', out_dir, args.basedir)
    os.makedirs(out_dir, exist_ok=True)

    index_fp = os.path.join(out_dir, 'index.html')
    with open(index_fp, 'w') as handle:
        handle.write(viewer.make_emperor(standalone=True))
        viewer.copy_support_files(out_dir)
def main(arguments):
    """Parse ``arguments`` and run the selected subcommand.

    Subcommands: ``betadiv``, ``taxsum``, ``pcoa`` and ``emperor``.
    Nothing is run when no subcommand is given. Each subcommand logs its
    name in square brackets to stderr before doing its work.

    Raises
    ------
    ValueError
        For ``betadiv`` with the ``weighted-unifrac`` metric when no tree
        file was supplied.
    """
    epilog_text = textwrap.dedent(
        '''additional information:
        output of taxsum, betadiv and pcoa are written in STDOUT

        If you use microbiomeutils in your work, please cite:
        Tremblay, Julien
        microbiomeutils 0.9 : Microbiome utilities
        https://github.com/jtremblay/microbiomeutils

        Thank you.''')
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text)
    subparsers = parser.add_subparsers(title='subcommands',
                                       description='valid subcommands',
                                       help='additional help',
                                       dest="command")

    # betadiv
    betadiv_parser = subparsers.add_parser('betadiv')
    betadiv_parser.add_argument('-i', '--infile-feature-table',
                                help="Input file",
                                type=argparse.FileType('r'))
    betadiv_parser.add_argument(
        "-m", "--metric",
        help="Diversity metric (default: bray-curtis)",
        choices=["bray-curtis", "weighted-unifrac"],
        default="bray-curtis")
    betadiv_parser.add_argument("-t", "--infile-tree",
                                help="Tree file (for weighted uniFrac)",
                                type=argparse.FileType('r'))

    # taxsum
    taxsum_parser = subparsers.add_parser('taxsum')
    taxsum_parser.add_argument('-i', '--infile-feature-table',
                               help="Input file",
                               type=argparse.FileType('r'))
    taxsum_parser.add_argument("-t", "--sumtype",
                               help="Summary type (default: absolute)",
                               choices=["absolute", "relative"],
                               default="absolute")
    taxsum_parser.add_argument(
        "-l", "--level",
        help="Level <int> 1 to 7",
        choices=["1", "2", "3", "4", "5", "6", "7", "8"],
        default="3")

    # pcoa
    pcoa_parser = subparsers.add_parser('pcoa')
    pcoa_parser.add_argument('-i', '--infile-distance-matrix',
                             help="Input file",
                             type=argparse.FileType('r'))

    # emperor
    emperor_parser = subparsers.add_parser('emperor')
    emperor_parser.add_argument('-i', '--infile-coords',
                                help="Input file",
                                type=argparse.FileType('r'))
    emperor_parser.add_argument('-m', '--mapping-file',
                                help="Mapping file",
                                type=argparse.FileType('r'))
    emperor_parser.add_argument('-o', '--outdir', help="Output directory")

    args = parser.parse_args(arguments)
    command = args.command

    if command == 'betadiv':
        feature_table_fp = os.path.abspath(args.infile_feature_table.name)
        sys.stderr.write("[betadiv]\n")
        needs_tree = args.metric == "weighted-unifrac"
        if args.infile_tree is None and needs_tree:
            raise ValueError(
                'weighted-unifrac needs a tree supplied. --infile-tree needed')
        if args.metric == "bray-curtis":
            betadiv(feature_table_fp, args.metric)
        else:
            betadiv(feature_table_fp, args.metric, args.infile_tree.name)
        return

    if command == 'taxsum':
        feature_table_fp = os.path.abspath(args.infile_feature_table.name)
        sys.stderr.write("[taxsum]\n")
        taxsum(feature_table_fp, args.sumtype, args.level)
        return

    if command == 'pcoa':
        sys.stderr.write("[pcoa]\n")
        distance_matrix_fp = os.path.abspath(
            args.infile_distance_matrix.name)
        do_pcoa(distance_matrix_fp)
        return

    if command == 'emperor':
        sys.stderr.write("[emperor]\n")
        mapping = pd.read_csv(args.mapping_file, sep='\t',
                              index_col='#SampleID',
                              dtype={'#SampleID': 'string'})
        coords = OrdinationResults.read(args.infile_coords)

        # The remote argument refers to where the support files will be
        # located relative to the plot itself, i.e. index.html.
        viewer = Emperor(coords, mapping, remote='.')

        # Folder where the plot is saved; created if missing.
        out_dir = args.outdir
        os.makedirs(out_dir, exist_ok=True)

        with open(os.path.join(out_dir, 'index.html'), 'w') as handle:
            handle.write(viewer.make_emperor(standalone=True))
            viewer.copy_support_files(out_dir)
def setUp(self):
    """Load the unweighted UniFrac ordination fixture into ``self.test_matrix``."""
    fixture_fp = get_data_path('unweighted_unifrac_pc.txt')
    self.test_matrix = OrdinationResults.read(fixture_fp)