def test_unstrat_default_pipeline(self):
    '''Test running default pipeline on unstratified input table.

    Runs pathway_pipeline with MinPath, coverage, regrouping and gap
    filling switched on, then checks the unstratified abundance and
    coverage tables against the expected tables read from disk.'''

    with TemporaryDirectory() as temp_dir:
        # Only the first two returned tables are checked by this test; the
        # remaining two return values are discarded.
        unstrat_path_abun_df, unstrat_path_cov_df, _, _ = pathway_pipeline(
            in_metagenome_unstrat,
            default_pathway_map,
            proc=1,
            out_dir=temp_dir,
            run_minpath=True,
            coverage=True,
            regroup_mapfile=default_regroup_map,
            gap_fill_on=True,
            per_sequence_contrib=False,
            print_cmds=False)

    # Compare these predicted tables to expected tables.
    exp_abun_unstrat = pd.read_csv(exp_abun_unstrat_file, sep="\t",
                                   index_col="pathway")

    exp_cov_unstrat = pd.read_csv(exp_cov_unstrat_file, sep="\t",
                                  index_col="pathway")

    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0.
    # rtol/atol of 0.5e-3 is the tolerance that check_less_precise=True
    # mapped to (requires pandas >= 1.1).
    compare_opts = {"check_like": True, "rtol": 0.5e-3, "atol": 0.5e-3}

    pd.testing.assert_frame_equal(exp_abun_unstrat, unstrat_path_abun_df,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_cov_unstrat, unstrat_path_cov_df,
                                  **compare_opts)
def test_strat_default_pipeline(self):
    '''Test running strat_minpath default pipeline.

    Make sure that community wide stratified abundances are calculated
    correctly and that unstratified abundances are right. Note that
    wide-format stratified tables are tested for this test only.'''

    with TemporaryDirectory() as temp_dir:
        # Underscore-prefixed names are returned by the pipeline but are
        # not checked by this test.
        (unstrat_pathabun,
         unstrat_pathcov,
         strat_pathabun,
         _strat_pathcov,
         _pathabun_by_seq,
         _pathcov_by_seq,
         _unstrat_pathabun_per_seq) = pathway_pipeline(
            in_metagenome_strat,
            default_pathway_map,
            proc=1,
            out_dir=temp_dir,
            run_minpath=True,
            coverage=True,
            regroup_mapfile=default_regroup_map,
            gap_fill_on=True,
            per_sequence_contrib=False,
            verbose=True,
            wide_table=True)

    # Compare these predicted tables to expected tables.
    exp_abun_unstrat = pd.read_csv(exp_abun_unstrat_file, sep="\t",
                                   index_col="pathway")

    exp_cov_unstrat = pd.read_csv(exp_cov_unstrat_file, sep="\t",
                                  index_col="pathway")

    exp_abun_strat = pd.read_csv(exp_abun_strat_file, sep="\t")

    # Sort stratified tables (different versions can sort the output
    # slightly differently) and reset the index labels so that rows line
    # up positionally.
    sort_cols = ['pathway', 'sequence']
    strat_pathabun = strat_pathabun.sort_values(
        sort_cols).reset_index(drop=True)
    exp_abun_strat = exp_abun_strat.sort_values(
        sort_cols).reset_index(drop=True)

    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0.
    # rtol/atol of 0.5e-3 is the tolerance that check_less_precise=True
    # mapped to (requires pandas >= 1.1).
    compare_opts = {"check_like": True, "rtol": 0.5e-3, "atol": 0.5e-3}

    pd.testing.assert_frame_equal(exp_abun_unstrat, unstrat_pathabun,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_cov_unstrat, unstrat_pathcov,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_abun_strat, strat_pathabun,
                                  **compare_opts)
def test_strat_per_genome_pipeline_strat_input(self):
    '''Test running strat_minpath default pipeline.

    Make sure that per genome contributions are correct
    (per_sequence_contrib set). In this case the input is a stratified
    table.'''

    with TemporaryDirectory() as temp_dir:
        # Underscore-prefixed names are returned by the pipeline but are
        # not checked by this test.
        (unstrat_pathabun,
         unstrat_pathcov,
         strat_pathabun,
         strat_pathcov,
         _pathabun_by_seq,
         _pathcov_by_seq,
         unstrat_pathabun_per_seq) = pathway_pipeline(
            in_metagenome_strat_per_seq,
            default_pathway_map,
            proc=1,
            out_dir=temp_dir,
            run_minpath=True,
            coverage=True,
            regroup_mapfile=default_regroup_map,
            gap_fill_on=True,
            per_sequence_contrib=True,
            per_sequence_abun=in_per_seq_abun,
            per_sequence_function=in_per_seq_func,
            verbose=True)

    # Compare these predicted tables to expected tables.
    exp_abun_unstrat = pd.read_csv(exp_abun_unstrat_per_genome_file,
                                   sep="\t", index_col="pathway")

    exp_abun_unstrat_per_seq = pd.read_csv(
        exp_abun_unstrat_per_genome_per_seq_file,
        sep="\t", index_col="pathway")

    exp_cov_unstrat = pd.read_csv(exp_cov_unstrat_per_genome_file,
                                  sep="\t", index_col="pathway")

    exp_abun_strat = pd.read_csv(exp_abun_strat_per_genome_file, sep="\t")

    exp_cov_strat = pd.read_csv(exp_cov_strat_per_genome_file, sep="\t")

    def sorted_by_function_and_taxon(table):
        # Sort stratified tables (different versions can sort the output
        # slightly differently) and reset the index labels.
        return table.sort_values(
            ['function', 'taxon']).reset_index(drop=True)

    strat_pathabun = sorted_by_function_and_taxon(strat_pathabun)
    exp_abun_strat = sorted_by_function_and_taxon(exp_abun_strat)
    strat_pathcov = sorted_by_function_and_taxon(strat_pathcov)
    exp_cov_strat = sorted_by_function_and_taxon(exp_cov_strat)

    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0.
    # rtol/atol of 0.5e-3 is the tolerance that check_less_precise=True
    # mapped to (requires pandas >= 1.1).
    compare_opts = {"check_like": True, "rtol": 0.5e-3, "atol": 0.5e-3}

    pd.testing.assert_frame_equal(exp_abun_unstrat, unstrat_pathabun,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_abun_unstrat_per_seq,
                                  unstrat_pathabun_per_seq,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_cov_unstrat, unstrat_pathcov,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_abun_strat, strat_pathabun,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_cov_strat, strat_pathcov,
                                  **compare_opts)
def main():
    '''Command-line entry point.

    Parses the arguments, runs pathway_pipeline (keeping intermediate
    files in args.intermediate when that is set, otherwise in a temporary
    directory) and writes the returned tables to args.out_dir as gzipped
    tab-delimited files.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    gap_fill_opt = not args.no_gap_fill

    run_minpath_opt = not args.skip_minpath

    def run_pipeline(intermediate_dir):
        # Single definition of the pipeline call; only the directory used
        # for intermediate files differs between the two cases below.
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.proc,
            out_dir=intermediate_dir,
            run_minpath=run_minpath_opt,
            coverage=args.coverage,
            gap_fill_on=gap_fill_opt,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            print_cmds=args.print_cmds)

    # If intermediate output directory set then create and output there.
    # Otherwise make a temporary directory for the intermediate files.
    if args.intermediate:
        make_output_dir(args.intermediate)
        pipeline_tables = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            pipeline_tables = run_pipeline(temp_dir)

    unstrat_abun, unstrat_cov, strat_abun, strat_cov = pipeline_tables

    make_output_dir(args.out_dir)

    def in_out_dir(filename):
        # Build the full path of an output file inside args.out_dir.
        return path.join(args.out_dir, filename)

    # Write output files.
    unstrat_abun.to_csv(path_or_buf=in_out_dir("path_abun_unstrat.tsv.gz"),
                        sep="\t", index_label="pathway",
                        compression="gzip")

    if args.coverage:
        unstrat_cov.to_csv(
            path_or_buf=in_out_dir("path_cov_unstrat.tsv.gz"),
            sep="\t", index_label="pathway", compression="gzip")

    # Write stratified output only if something besides None was returned.
    if strat_abun is not None:
        strat_abun.to_csv(path_or_buf=in_out_dir("path_abun_strat.tsv.gz"),
                          sep="\t", index=False, compression="gzip")

    if args.coverage and strat_cov is not None:
        strat_cov.to_csv(path_or_buf=in_out_dir("path_cov_strat.tsv.gz"),
                         sep="\t", index=False, compression="gzip")
def main():
    '''Command-line entry point.

    Parses the arguments, runs pathway_pipeline (keeping intermediate
    files in args.intermediate when that is set, otherwise in a temporary
    directory) and writes each returned table that applies to
    args.out_dir as a gzipped tab-delimited file.'''

    args = parser.parse_args()

    # Check that input files exist.
    check_files_exist([args.input, args.map])

    gap_fill_opt = not args.no_gap_fill

    run_minpath_opt = not args.skip_minpath

    def run_pipeline(intermediate_dir):
        # Single definition of the pipeline call; only the directory used
        # for intermediate files differs between the two cases below.
        return pathway_pipeline(
            inputfile=args.input,
            mapfile=args.map,
            regroup_mapfile=args.regroup_map,
            proc=args.processes,
            out_dir=intermediate_dir,
            run_minpath=run_minpath_opt,
            coverage=args.coverage,
            gap_fill_on=gap_fill_opt,
            no_regroup=args.no_regroup,
            per_sequence_contrib=args.per_sequence_contrib,
            per_sequence_abun=args.per_sequence_abun,
            per_sequence_function=args.per_sequence_function,
            wide_table=args.wide_table,
            print_cmds=args.print_cmds)

    # If intermediate output directory set then create and output there.
    # Otherwise make a temporary directory for the intermediate files.
    if args.intermediate:
        make_output_dir(args.intermediate)
        pipeline_tables = run_pipeline(args.intermediate)
    else:
        with TemporaryDirectory() as temp_dir:
            pipeline_tables = run_pipeline(temp_dir)

    (unstrat_abun,
     unstrat_cov,
     strat_abun,
     strat_cov,
     path_abun_by_seq,
     path_cov_by_seq,
     unstrat_abun_per_seq) = pipeline_tables

    make_output_dir(args.out_dir)

    def in_out_dir(filename):
        # Build the full path of an output file inside args.out_dir.
        return path.join(args.out_dir, filename)

    # Write output files. The unstratified abundance table will always be
    # written, but the other files will only be written if applicable.
    unstrat_abun.to_csv(path_or_buf=in_out_dir("path_abun_unstrat.tsv.gz"),
                        sep="\t", index_label="pathway",
                        compression="gzip")

    if args.coverage:
        unstrat_cov.to_csv(
            path_or_buf=in_out_dir("path_cov_unstrat.tsv.gz"),
            sep="\t", index_label="pathway", compression="gzip")

    # The stratified tables are named "strat" in wide format and "contrib"
    # otherwise.
    if strat_abun is not None:
        strat_abun_name = ("path_abun_strat.tsv.gz" if args.wide_table
                           else "path_abun_contrib.tsv.gz")
        strat_abun.to_csv(path_or_buf=in_out_dir(strat_abun_name),
                          sep="\t", index=False, compression="gzip")

    if args.coverage and strat_cov is not None:
        strat_cov_name = ("path_cov_strat.tsv.gz" if args.wide_table
                          else "path_cov_contrib.tsv.gz")
        strat_cov.to_csv(path_or_buf=in_out_dir(strat_cov_name),
                         sep="\t", index=False, compression="gzip")

    if path_abun_by_seq is not None:
        path_abun_by_seq.to_csv(
            path_or_buf=in_out_dir("path_abun_predictions.tsv.gz"),
            sep="\t", index=True, compression="gzip",
            index_label="sequence")

    if args.coverage and path_cov_by_seq is not None:
        path_cov_by_seq.to_csv(
            path_or_buf=in_out_dir("path_cov_predictions.tsv.gz"),
            sep="\t", index=True, compression="gzip",
            index_label="sequence")

    if unstrat_abun_per_seq is not None:
        unstrat_abun_per_seq.to_csv(
            path_or_buf=in_out_dir("path_abun_unstrat_per_seq.tsv.gz"),
            sep="\t", index_label="pathway", compression="gzip")
def test_strat_default_pipeline(self):
    '''Test running strat_minpath default pipeline.

    Make sure that community wide stratified abundances are calculated
    correctly and that unstratified abundances are right.'''

    with TemporaryDirectory() as temp_dir:
        # The fourth returned value is not checked by this test and is
        # discarded.
        (unstrat_path_abun_df,
         unstrat_path_cov_df,
         strat_path_abun_df,
         _) = pathway_pipeline(
            in_metagenome_strat,
            default_pathway_map,
            proc=1,
            out_dir=temp_dir,
            run_minpath=True,
            coverage=True,
            regroup_mapfile=default_regroup_map,
            gap_fill_on=True,
            per_sequence_contrib=False,
            print_cmds=False)

    # Compare these predicted tables to expected tables.
    exp_abun_unstrat = pd.read_csv(exp_abun_unstrat_file, sep="\t",
                                   index_col="pathway")

    exp_cov_unstrat = pd.read_csv(exp_cov_unstrat_file, sep="\t",
                                  index_col="pathway")

    exp_abun_strat = pd.read_csv(exp_abun_strat_file, sep="\t")

    # Sort stratified tables (different versions can sort the output
    # slightly differently) and reset the index labels so that rows line
    # up positionally.
    sort_cols = ['pathway', 'sequence']
    strat_path_abun_df = strat_path_abun_df.sort_values(
        sort_cols).reset_index(drop=True)
    exp_abun_strat = exp_abun_strat.sort_values(
        sort_cols).reset_index(drop=True)

    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0.
    # rtol/atol of 0.5e-3 is the tolerance that check_less_precise=True
    # mapped to (requires pandas >= 1.1).
    compare_opts = {"check_like": True, "rtol": 0.5e-3, "atol": 0.5e-3}

    pd.testing.assert_frame_equal(exp_abun_unstrat, unstrat_path_abun_df,
                                  **compare_opts)

    pd.testing.assert_frame_equal(exp_cov_unstrat, unstrat_path_cov_df,
                                  **compare_opts)

    # Check with less precision here since the HUMAnN2 output that is used
    # as expected abundances is not rounded.
    pd.testing.assert_frame_equal(exp_abun_strat, strat_path_abun_df,
                                  **compare_opts)