def test_rare_2_samp(self):
    '''Verify which sequences are reported as rare with a 2 sample
    cut-off.'''

    abun_table = biom_to_pandas_df(biom.load_table(seqtab_biom))

    expected_rare = {"2558860574", "2571042244"}
    observed_rare = set(id_rare_seqs(abun_table, 1, 2))

    self.assertSetEqual(observed_rare, expected_rare)
def test_rare_4_reads(self):
    '''Verify which sequences are reported as rare with a 4 read
    cut-off.'''

    abun_table = biom_to_pandas_df(biom.load_table(seqtab_biom))

    expected_rare = {"2558860574", "extra"}
    observed_rare = set(id_rare_seqs(abun_table, 4, 1))

    self.assertSetEqual(observed_rare, expected_rare)
def test_norm_by_marker_copies(self):
    '''Check that normalizing by marker copies reproduces the expected
    normalized sequence abundance table.'''

    raw_counts = biom_to_pandas_df(biom.load_table(seqtab_biom))

    # Put the rows in the same order as the expected table before
    # normalizing, so the output index labels line up with it.
    ordered_counts = raw_counts.reindex(exp_norm_in.index)

    observed_norm = norm_by_marker_copies(input_seq_counts=ordered_counts,
                                          input_marker_num=marker_predict_in,
                                          norm_filename=None)

    # The normalized table has to match the expected one.
    pd.testing.assert_frame_equal(observed_norm, exp_norm_in)
def main():
    '''Command-line entry point: write one FASTA file per sample.

    Parses the --fasta, --biom and --outdir arguments, reads the FASTA
    and the BIOM table, and for every sample (column of the table) writes
    "<outdir>/<sample>.fasta". In that file each ASV with a count above
    zero is repeated int(count) times, with headers of the form
    ">{asv}_{sample}_{i}".

    Exits through sys.exit if the input FASTA contains no sequences.
    '''

    parser = argparse.ArgumentParser(
        description="Creates output FASTA for each sample with each ASV repeated for every count in that sample.",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f", "--fasta", metavar="FASTA", type=str,
                        help="Path to full FASTA file.", required=True)

    parser.add_argument("-b", "--biom", metavar="BIOM", type=str,
                        help="Path to BIOM table.", required=True)

    parser.add_argument("-o", "--outdir", metavar="PATH", type=str,
                        help="Name of folder to make for output files.",
                        required=True)

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    in_table = biom_to_pandas_df(biom.load_table(args.biom))

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    make_output_dir(args.outdir)

    for sample in in_table.columns:
        sample_outfile = args.outdir + "/" + sample + ".fasta"

        # Context manager guarantees the handle is closed even if a lookup
        # or write below raises part-way through the sample.
        with open(sample_outfile, 'wt') as sample_outfh:
            for asv in in_table.index.values:
                asv_count = in_table.loc[asv, sample]

                if asv_count > 0:
                    # One record per counted read; the running index keeps
                    # the headers unique within the file.
                    for i in range(int(asv_count)):
                        print(">" + asv + "_" + sample + "_" + str(i),
                              file=sample_outfh)
                        print(in_fasta[asv], file=sample_outfh)
def run_metagenome_pipeline(input_biom, function, marker, max_nsti,
                            out_dir='metagenome_out', proc=1,
                            output_normfile=False):
    '''Main function to run full metagenome pipeline. Meant to run modular
    functions largely listed below. Will return predicted metagenomes
    stratified and unstratified by contributing genomes (i.e. taxa).

    Parameters:
        input_biom: path handed to biom.load_table; the table of study
            sequence abundances.
        function: path to a tab-delimited table of predicted function
            abundances with a "sequence" column used as the index.
        marker: path to a tab-delimited table of predicted marker gene
            abundances with a "sequence" column used as the index.
        max_nsti: rows whose "metadata_NSTI" value exceeds this are
            dropped from the prediction tables (only applied when that
            column is present).
        out_dir: directory that receives the output files; it is passed to
            make_output_dir before anything is written.
        proc: forwarded unchanged to funcs_by_sample.
        output_normfile: when true, the normalized sequence abundance
            table is written to "seqtab_norm.tsv" inside out_dir.

    Returns:
        Whatever funcs_by_sample returns for the normalized sequence
        abundances and the filtered function predictions.

    Side effects:
        When an NSTI column was found, calc_weighted_nsti is called with
        "weighted_nsti.tsv" inside out_dir as its outfile.
    '''

    # Read in input table of sequence abundances and convert to pandas df.
    study_seq_counts = biom_to_pandas_df(biom.load_table(input_biom))

    # Read in predicted function and marker gene abundances.
    pred_function = pd.read_table(function, sep="\t", index_col="sequence")
    pred_marker = pd.read_table(marker, sep="\t", index_col="sequence")

    # Force string ids so that the prediction indices are strings no matter
    # how the "sequence" column was parsed.
    pred_function.index = pred_function.index.astype(str)
    pred_marker.index = pred_marker.index.astype(str)

    # Initialize empty pandas dataframe to contain NSTI values.
    nsti_val = pd.DataFrame()

    # If NSTI column present then remove all rows with value above specified
    # max value. Also, remove NSTI column (in both dataframes).
    # The column is dropped by rebinding rather than in place: the filtered
    # frame is derived from a boolean-mask selection, and mutating it in
    # place can trigger pandas' SettingWithCopyWarning.
    if "metadata_NSTI" in pred_function.columns:
        pred_function = pred_function[pred_function['metadata_NSTI'] <= max_nsti]
        nsti_val = pred_function[['metadata_NSTI']]
        pred_function = pred_function.drop('metadata_NSTI', axis=1)

    # When both tables carry NSTI values the marker table's values are the
    # ones kept in nsti_val, since this assignment happens last.
    if "metadata_NSTI" in pred_marker.columns:
        pred_marker = pred_marker[pred_marker['metadata_NSTI'] <= max_nsti]
        nsti_val = pred_marker[['metadata_NSTI']]
        pred_marker = pred_marker.drop('metadata_NSTI', axis=1)

    # Re-order predicted abundance tables to be in same order as study seqs.
    # Also, drop any sequence ids that don't overlap across all dataframes.
    study_seq_counts, pred_function, pred_marker = three_df_index_overlap_sort(
        study_seq_counts, pred_function, pred_marker)

    # Create output directory if it does not already exist.
    make_output_dir(out_dir)

    # Create normalized sequence abundance filename if outfile specified.
    if output_normfile:
        norm_output = path.join(out_dir, "seqtab_norm.tsv")
    else:
        norm_output = None

    # Normalize input study sequence abundances by predicted abundance of
    # marker genes and output normalized table if specified.
    study_seq_counts = norm_by_marker_copies(input_seq_counts=study_seq_counts,
                                             input_marker_num=pred_marker,
                                             norm_filename=norm_output)

    # If NSTI column input then output weighted NSTI values. The return
    # value is not needed here; the call is made for the file it is given.
    if not nsti_val.empty:
        weighted_nsti_out = path.join(out_dir, "weighted_nsti.tsv")
        calc_weighted_nsti(seq_counts=study_seq_counts,
                           nsti_input=nsti_val,
                           outfile=weighted_nsti_out)

    # Get predicted function counts by sample, stratified by contributing
    # genomes and also separately unstratified.
    return funcs_by_sample(input_seq_counts=study_seq_counts,
                           input_function_num=pred_function,
                           proc=proc)