def test_determine_cases_and_controls(unit, normal_input, normal_output,
                                      normal_control, normal_case,
                                      noentries_case, empty_file):
    '''
    Tests the program's labeling of case and control samples

    Parameters
    ----------
    unit: string
        Location of the folder that holds the unit test files

    normal_input: string
        Name of the tsv file that will be loaded into a metadata object.
        This object will be used to test the case and control labeling
        functionality of match_functions.py

    normal_output: string
        Name of the tsv file that will be loaded into a metadata object.
        This object will be used to test that valid case and control inputs
        get the correct output

    normal_control: string
        Name of the query file that contains the normal control query using IN

    normal_case: string
        Name of the query file that contains the normal case query

    noentries_case: string
        Name of the query file that results in no cases being found. This
        tests that filtering everything out gives an error.

    empty_file: string
        Name of the empty file used in various tests
    '''
    extra = False
    norm_in = Metadata.load("./%s/%s" % (unit, normal_input))
    norm_out = Metadata.load("./%s/%s" % (unit, normal_output))
    norm_case = open("./%s/%s" % (unit, normal_case),
                     "r").read().splitlines()
    norm_control = open("./%s/%s" % (unit, normal_control),
                        "r").read().splitlines()
    noentry_case = open("./%s/%s" % (unit, noentries_case),
                        "r").read().splitlines()
    emp_file = open("./%s/%s" % (unit, empty_file), "r").read().splitlines()

    case_control_dict = {"case": norm_case, "control": norm_control}
    unit_norm_out = match_functions.determine_cases_and_controls(
        norm_in, case_control_dict, extra)

    case_control_dict = {"case": emp_file, "control": emp_file}
    assert_raises(ValueError, match_functions.determine_cases_and_controls,
                  norm_in, case_control_dict, extra)

    case_control_dict = {"case": noentry_case, "control": norm_control}
    assert_raises(ValueError, match_functions.determine_cases_and_controls,
                  norm_in, case_control_dict, extra)

    norm_out = norm_out.to_dataframe()
    unit_norm_out = unit_norm_out.to_dataframe()
    assert_frame_equal(norm_out, unit_norm_out)
def test_keep_samples(unit, normal_input, normal_output, normal_keep,
                      noentries_keep, empty_file):
    '''
    Tests the program's filtering out of unwanted samples based on sql
    queries

    Parameters
    ----------
    unit: string
        Location of the folder that holds the unit test files

    normal_input: string
        Name of the tsv file that will be loaded into a metadata object.
        This object will be used to test the keeping of samples
        functionality of match_functions.py

    normal_output: string
        Name of the tsv file that will be loaded into a metadata object.
        This object will be used to test that valid inputs for keeping
        samples get the correct output

    normal_keep: string
        Name of the query file that contains the sql queries to do a normal
        keep

    noentries_keep: string
        Name of the query file that contains the sql queries that keep no
        samples

    empty_file: string
        Name of the empty file used in various tests
    '''
    extra = False
    norm_in = Metadata.load("./%s/%s" % (unit, normal_input))
    norm_out = Metadata.load("./%s/%s" % (unit, normal_output))
    norm_keep = open("./%s/%s" % (unit, normal_keep),
                     "r").read().splitlines()
    noentry_keep = open("./%s/%s" % (unit, noentries_keep),
                        "r").read().splitlines()
    emp_file = open("./%s/%s" % (unit, empty_file), "r").read().splitlines()

    unit_norm_out = match_functions.keep_samples(norm_in, norm_keep, extra)
    assert_raises(ValueError, match_functions.keep_samples, norm_in,
                  emp_file, extra)
    assert_raises(ValueError, match_functions.keep_samples, norm_in,
                  noentry_keep, extra)

    unit_norm_out = unit_norm_out.to_dataframe()
    norm_out = norm_out.to_dataframe()
    assert_frame_equal(norm_out, unit_norm_out)
def setUp(self):
    super().setUp()
    self.preprocess = self.plugin.pipelines['preprocess']

    continuous_metadata = pd.DataFrame(
        {
            'target': ['1.0', '2.0', '3.0', '4.0'],
            'contain_nan': ['3.3', '3.5', None, '3.9']
        },
        index=pd.Index(['A', 'B', 'C', 'D'], name='id'))
    self.continuous_metadata = continuous_metadata

    discrete_metadata = pd.DataFrame(
        {
            'target': ['0', '1', '0', '1'],
            'target_int': [1, 0, 1, 0],
            'contain_nan': ['0', '1', None, '1'],
            'non_encoded': ['10', '2', '', 'b']
        },
        index=pd.Index(['A', 'B', 'C', 'D'], name='id'))
    self.discrete_metadata = discrete_metadata

    TEST_DIR = path.split(__file__)[0]
    md_path = path.join(TEST_DIR, 'data/sample-metadata-binary.tsv')
    table_path = path.join(TEST_DIR, 'data/table.qza')
    rooted_tree_path = path.join(TEST_DIR, 'data/rooted-tree.qza')
    unrooted_tree_path = path.join(TEST_DIR, 'data/unrooted-tree.qza')

    self.mp_sample_metadata = Metadata.load(md_path)
    self.mp_table = Artifact.load(table_path)
    self.mp_rooted_tree = Artifact.load(rooted_tree_path)
    self.mp_unrooted_tree = Artifact.load(unrooted_tree_path)
def test_primitive_passed_incorrectly(self):
    concatenate_ints = self.plugin.methods['concatenate_ints']
    identity_with_metadata = self.plugin.methods['identity_with_metadata']
    params_only_method = self.plugin.methods['params_only_method']

    md_fp = get_data_path('valid/simple.tsv')
    inappropriate_metadata = Metadata.load(md_fp)

    ints1 = Artifact.import_data(IntSequence1, [0, 42, 43])
    ints3 = Artifact.import_data(IntSequence1, [12, 111])

    int1 = 4
    int2 = 5

    arbitrary_int = 43

    # tests primitive int passed as IntSequence artifact
    with self.assertRaisesRegex(TypeError,
                                'ints2.*43.*incompatible.*IntSequence1'):
        concatenate_ints(ints1, arbitrary_int, ints3, int1, int2)

    # tests primitive passed as metadata
    with self.assertRaisesRegex(TypeError,
                                'metadata.*43.*incompatible.*Metadata'):
        identity_with_metadata(ints1, arbitrary_int)

    # tests wrong type of primitive passed
    with self.assertRaisesRegex(TypeError,
                                'age.*arbitraryString.*incompatible.*Int'):
        params_only_method('key string', 'arbitraryString')

    # tests metadata passed as artifact
    with self.assertRaisesRegex(TypeError,
                                '\'ints2\'.*Metadata.*IntSequence1'):
        concatenate_ints(ints1, inappropriate_metadata, ints3, int1, int2)
def merge_metadata(metadata, output):
    dfs = []
    for file in metadata:
        dfs.append(Metadata.load(file).to_dataframe())
    Metadata(pd.concat(dfs)).save(output)
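A minimal usage sketch (not from the original source) for the merge_metadata helper above; the file names are placeholders, and it assumes pandas and qiime2.Metadata are imported as the function requires:

from qiime2 import Metadata
import pandas as pd

# Merge two per-batch metadata files into a single TSV (hypothetical paths).
merge_metadata(["batch1-metadata.tsv", "batch2-metadata.tsv"],
               "merged-metadata.tsv")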
def get_itol_barchart(fdata: pd.DataFrame, table_file: str,
                      metadata_file: str, metadata_column: str,
                      output_file: str):
    '''Generate a table in QIIME 2 artifact format which can be directly
    parsed by iTOL and yield a multi-bar chart.
    '''
    # load sample feature table
    table = Artifact.load(table_file)

    # extract BIOM table
    table = table.view(biom.Table)

    # load sample metadata
    meta = Metadata.load(metadata_file)

    # generate a sample Id to category map
    column = meta.get_column(metadata_column).drop_missing_values()
    catmap = column.to_series().to_dict()

    # collapse feature table by category
    # note: when multiple samples map to one category, take **mean**
    table = table.collapse(lambda i, _: catmap[i], norm=True, axis='sample')

    # import BIOM table into QIIME 2 and save
    res = Artifact.import_data('FeatureTable[Frequency]', table)
    res.save(output_file)
def add_metadata(metadata, columns, output):
    mf1 = Metadata.load(metadata).to_dataframe()
    index_name = mf1.index.name
    dtypes = mf1.dtypes.to_dict()
    mf2 = pd.read_table(columns, keep_default_na=False)

    for k, v in dtypes.items():
        if k in mf2.columns:
            if v == 'object':
                mf2[k] = mf2[k].astype(str)
            else:
                mf2[k] = mf2[k].astype(v)

    mf3 = mf1.reset_index().merge(mf2).set_index(index_name)

    a = mf1.shape[0]
    b = mf3.shape[0]

    if a != b:
        message = (f"Final metadata (N={b}) has different number of samples "
                   f"than input metadata (N={a}). Please double check "
                   "whether this was intended.")
        warnings.warn(message)

    Metadata(mf3).save(output)
def load_mp_data():
    """Loads data from the QIIME 2 moving pictures tutorial for visualization.

    It's assumed that this data is already stored in docs/moving-pictures/,
    aka the PREFIX_DIR global variable set above, which should be located
    relative to where this function is being run from. If this directory or
    the data files within it cannot be accessed, this function will
    (probably) break.

    Returns
    -------
    (tree, table, md, fmd, ordination)
        tree: Artifact with semantic type Phylogeny[Rooted]
            Phylogenetic tree.
        table: Artifact with semantic type FeatureTable[Frequency]
            Feature table.
        md: Metadata
            Sample metadata.
        fmd: Metadata
            Feature metadata. (Although this is stored in the repository as a
            FeatureData[Taxonomy] artifact, we transform it to Metadata.)
        pcoa: Artifact with semantic type PCoAResults
            Ordination.
    """
    tree = Artifact.load(os.path.join(PREFIX_DIR, "rooted-tree.qza"))
    table = Artifact.load(os.path.join(PREFIX_DIR, "table.qza"))
    pcoa = Artifact.load(
        os.path.join(PREFIX_DIR, "unweighted_unifrac_pcoa_results.qza")
    )
    md = Metadata.load(os.path.join(PREFIX_DIR, "sample_metadata.tsv"))
    # We have to transform the taxonomy QZA to Metadata ourselves
    taxonomy = Artifact.load(os.path.join(PREFIX_DIR, "taxonomy.qza"))
    fmd = taxonomy.view(Metadata)
    return tree, table, md, fmd, pcoa
def _load_q2_metadata(metadata_path, name):
    try:
        new_resource = Metadata.load(metadata_path)
    except TypeError:
        # if metadata_path is some type that does not have a '+' method with
        # str, e.g., dict, then q2 metadata will raise a TypeError. Catch
        # this error and raise a MetadataFileError, which is more informative
        raise MetadataFileError(str(metadata_path))
    return new_resource.to_dataframe()
def setup(feature_table, sample_metadata):
    try:
        Artifact.load(feature_table)
    except Exception as e:
        raise ValueError(e)
    try:
        Metadata.load(sample_metadata)
    except Exception as e:
        raise ValueError(e)

    os.mkdir("pbs_out")

    with fileinput.input("config/config.yaml", inplace=True) as f:
        for line in f:
            if f.filelineno() == 1:
                print(f"feature_table: {feature_table}")
            elif f.filelineno() == 2:
                print(f"sample_metadata: {sample_metadata}")
            else:
                print(line, end="")
def setUp(self):
    super().setUp()

    data_single = SingleLanePerSampleSingleEndFastqDirFmt(
        self.get_data_path('filter_samples_single_end/dir_fmt'), mode='r')
    self.sample_single = _PlotQualView(data_single, False)
    self.manifest_single = data_single.manifest.view(pd.DataFrame)
    self.md_single_all = Metadata.load(
        self.get_data_path('filter_samples_single_end/filter_all.tsv'))
    self.md_single_subset = Metadata.load(
        self.get_data_path('filter_samples_single_end/filter_subset.tsv'))
    self.md_single_none = Metadata.load(
        self.get_data_path('filter_samples_single_end/filter_none.tsv'))

    data_paired = SingleLanePerSamplePairedEndFastqDirFmt(
        self.get_data_path('filter_samples_paired_end/dir_fmt'), mode='r')
    self.sample_paired = _PlotQualView(data_paired, True)
    self.manifest_paired = data_paired.manifest.view(pd.DataFrame)
    self.md_paired_all = Metadata.load(
        self.get_data_path('filter_samples_single_end/filter_all.tsv'))
    self.md_paired_subset = Metadata.load(
        self.get_data_path('filter_samples_single_end/filter_subset.tsv'))
    self.md_paired_none = Metadata.load(
        self.get_data_path('filter_samples_single_end/filter_none.tsv'))
def do_demux_art(input_fp, metadata, metadata_bc_col, rev_bc, rev_map_bc,
                 output_fp):
    """Imports data and runs demux on it

    Parameters
    ----------
    input_fp: str
        Path to folder that contains sequences and barcodes. MUST be named
        sequences.fastq.gz and barcodes.fastq.gz
    metadata: str
        Path to metadata file that contains barcode sequences. See
        "sample-metadata.tsv" in mockrobiota datasets for examples.
    metadata_bc_col: str
        Name of column in metadata file that holds the barcode sequences
    rev_bc: bool
        Whether to reverse barcodes
    rev_map_bc: bool
        Whether to reverse mapping barcodes
    output_fp: str, optional or None
        Path to where we output demuxed qza file. Does not save if None

    Returns
    -------
    Artifact
        demuxed sequences
    Metadata
        Associated metadata
    """
    start = time.clock()
    if(input_fp is None or metadata is None or metadata_bc_col is None):
        click.echo("Run \'rdemux --help\' flag to see correct usage")
        return

    click.echo("Importing seq data from " + input_fp)
    art = Artifact.import_data("EMPSingleEndSequences", input_fp)

    click.echo("Loading metadata from " + metadata)
    barcode_metadata = Metadata.load(metadata)

    click.echo("Demuxing")
    demux, = emp_single(art,
                        barcode_metadata.get_column(metadata_bc_col),
                        rev_comp_barcodes=rev_bc,
                        rev_comp_mapping_barcodes=rev_map_bc)

    if(output_fp is None):
        click.echo("Not saving demux output")
    else:
        demux.save(output_fp)

    click.echo("{}s for do_demux".format(str(time.clock() - start)))
    return demux, barcode_metadata
def get_user_input_query_lines(dictofFiles):
    '''
    Uses a dictionary of file paths/names (dictofFiles) to create a new
    dictionary (dict_of_file_lines) of arrays that represent the lines of
    each file from the original input dictionary (dictofFiles)

    Parameters
    ----------
    dictofFiles: dictionary of strings
        Each key describes what the string is for, like metadata being the
        input metadata file. The element of each key is a file path/name

    Returns
    -------
    dict_of_file_lines: dictionary of arrays of strings
        The dictionary has the keys that have some value in dictofFiles. The
        elements are an array of the lines of the file the key corresponds
        to.

    Raises
    ------
    ValueError
        If a metadata file can't be loaded into a metadata object
        If a file can't be opened so that its lines can be read into the
        dictionary of input files
    '''
    dict_of_file_lines = {}
    for key in dictofFiles:
        if dictofFiles[key] is None:
            continue
        if key == "metadata":
            if isinstance(dictofFiles[key], str):
                # read metadata file into metadata object
                print("metadata file path entered is %s" % (dictofFiles[key]))
                dict_of_file_lines[key] = Metadata.load(dictofFiles[key])
            else:
                dict_of_file_lines[key] = dictofFiles[key]
        else:
            print("file path entered is %s" % (dictofFiles[key]))
            try:
                dict_of_file_lines[key] = open("./%s" % (dictofFiles[key]),
                                               "r").read().splitlines()
            except:
                raise ValueError("File could not be opened")
    return dict_of_file_lines
def load_env_metadata(env_metadata_path):
    # Use QIIME2 Metadata API to load metadata
    env_metadata_obj = Metadata.load(env_metadata_path)
    env_metadata_df = env_metadata_obj.to_dataframe()

    # Rename index
    env_metadata_df.index.names = ['SampleID']

    # environmental metadata columns MUST be numeric type
    # Drop all non-numeric columns
    numeric_env_df = env_metadata_df.select_dtypes(include='number')

    if (len(numeric_env_df.columns) == 0):
        raise AXIOME3Error(
            "Environmental metadata must contain at least one numeric column!")

    return numeric_env_df
def load_metadata(metadata_path):
    # Use QIIME2 Metadata API to load metadata
    metadata_obj = Metadata.load(metadata_path)
    metadata_df = metadata_obj.to_dataframe()

    # Rename index
    metadata_df.index.names = ['SampleID']

    # By default, pandas treats string as object
    # Convert object dtype to category
    cols = metadata_df.columns
    object_type_cols = cols[metadata_df.dtypes == object]

    for col in object_type_cols:
        convert_col_dtype(metadata_df, col, "category")

    return metadata_df
def add_metadata(metadata_file, columns_file, output_file):
    """Add new metadata columns to an existing metadata file (.tsv).

    The files '-i/--metadata-file' and '-c/--columns-file' must have at
    least one overlapping column.

    Parameters
    ----------
    metadata_file
        Path to the metadata file.
    columns_file
        Path to a text file (.tsv) containing the columns to be added. The
        first row should be column names.
    output_file
        Path to the output file.
    """
    mf1 = Metadata.load(metadata_file).to_dataframe()
    index_name = mf1.index.name
    dtypes = mf1.dtypes.to_dict()
    mf2 = pd.read_table(columns_file, keep_default_na=False)

    for k, v in dtypes.items():
        if k in mf2.columns:
            if v == 'object':
                mf2[k] = mf2[k].astype(str)
            else:
                mf2[k] = mf2[k].astype(v)

    mf3 = mf1.reset_index().merge(mf2).set_index(index_name)
    mf3 = mf3.reindex(mf1.index)

    a = mf1.shape[0]
    b = mf3.shape[0]

    if a != b:
        message = (f"Final metadata (N={b}) has different number of samples "
                   f"than input metadata (N={a}). Please double check "
                   "whether this was intended.")
        warnings.warn(message)

    if mf3.isnull().values.any():
        warnings.warn("Final metadata contains NaN. Please double check "
                      "whether this was intended.")

    Metadata(mf3).save(output_file)
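A hedged usage sketch for the add_metadata command above; the file names are hypothetical, and the columns file is assumed to share at least one column with the existing metadata so the merge has a key:

# Hypothetical inputs: sample-metadata.tsv (existing metadata) and
# new-columns.tsv, whose first row holds column names, at least one of
# which also appears in the existing metadata.
add_metadata("sample-metadata.tsv",
             "new-columns.tsv",
             "sample-metadata-extended.tsv")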
def manipulate_md(input_metadata_file, param_list, output_metadata_file,
                  modification_func):
    """Automates a common I/O paradigm in Qeeseburger's scripts.

    Loads a metadata file as a pandas DataFrame, calls modification_func on
    the DF with some specified parameters (can be an empty list if there are
    no other parameters besides the metadata file), and outputs the modified
    metadata DF to an output path.
    """
    # First off, load the metadata file and convert it to a DataFrame
    m = Metadata.load(input_metadata_file)
    m_df = m.to_dataframe()

    # ... Actually do relevant computations
    m_df_new = modification_func(m_df, *param_list)

    # Convert modified DataFrame back into a q2 Metadata object and save it
    Metadata(m_df_new).save(output_metadata_file)
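Because manipulate_md passes the loaded DataFrame plus param_list to modification_func, a caller can supply any DataFrame-in/DataFrame-out function; a minimal sketch with a hypothetical helper and placeholder paths:

def add_constant_column(df, column_name, value):
    # Return a copy of the metadata with one extra constant-valued column.
    out = df.copy()
    out[column_name] = value
    return out

# Placeholder paths; writes the metadata back out with a new 'cohort' column.
manipulate_md("input-metadata.tsv", ["cohort", "pilot-study"],
              "output-metadata.tsv", add_constant_column)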
def import_dataset(working_dir_fp, metadata_barcode_column,
                   rev_comp_barcodes_in=False,
                   rev_comp_mapping_barcodes_in=False):
    """Imports seqs as qiime artifact, demuxes them.

    Requires that fastq.gz files already be in
    working_dir_fp/emp-single-end-seqs and sample-metadata.tsv be in
    working_dir_fp

    Parameters
    ----------
    working_dir_fp: str
        filepath where sequences_url + barcodes_url file are downloaded to
        and put into a directory "emp-single-end-sequences". Should also
        contain sample-metadata.tsv. Ideally, this should be a mock-<n>
        directory from when you clone the mockrobiota github repo.
        Should not end with "/"
    metadata_barcode_column: str
        column header in sample-metadata.tsv that holds barcode data
    rev_comp_barcodes_in: bool
        param to emp_single for reversing barcode seqs
    rev_comp_mapping_barcodes_in: bool
        param to emp_single for reversing barcode seqs in metadata

    Returns
    -------
    demuxed seqs, loaded metadata
    OR
    None if fails
    """
    print("Importing seq data")
    seqs = Artifact.import_data("EMPSingleEndSequences",
                                working_dir_fp + "/emp-single-end-sequences")

    print("Loading metadata")
    barcode_metadata = Metadata.load(working_dir_fp + "/sample-metadata.tsv")

    print("Demuxing")
    demux, = emp_single(seqs,
                        barcode_metadata.get_column(metadata_barcode_column),
                        rev_comp_barcodes=rev_comp_barcodes_in,
                        rev_comp_mapping_barcodes=rev_comp_mapping_barcodes_in)
    return demux, barcode_metadata
def get_user_input_query_lines(verbose, dictofFiles):
    '''
    Uses a dictionary of file paths/names (dictofFiles) to create a new
    dictionary (dict_of_file_lines) of arrays that represent the lines of
    each file from the original input dictionary (dictofFiles)

    Parameters
    ----------
    verbose: boolean
        Tells function if it should output print statements or not. True
        outputs print statements.

    dictofFiles: dictionary of strings
        Each key describes what the string is for, like inputdata being the
        input metadata file. The element of each key is a file path/name

    Returns
    -------
    dict_of_file_lines: dictionary of arrays of strings
        The dictionary has the keys that have some value in dictofFiles. The
        elements are an array of the lines of the file the key corresponds
        to.
    '''
    #dictOfReturnValues = {"inputdata":None, "keep":None, "control":None,
    #                      "case":None, "nullvalues":None, "match":None}
    dict_of_file_lines = {}
    for key in dictofFiles:
        if dictofFiles[key] is None:
            continue
        if key == "inputdata":
            #read metadata file into metadata object
            if verbose:
                print("metadata file path entered is %s" % (dictofFiles[key]))
            try:
                dict_of_file_lines[key] = Metadata.load(dictofFiles[key])
            except:
                raise ValueError('metadata file could not load. The file '
                                 'must be a TSV metadata file.')
        else:
            if verbose:
                print("file path entered is %s" % (dictofFiles[key]))
            try:
                dict_of_file_lines[key] = open('./%s' % (dictofFiles[key]),
                                               'r').readlines()
            except:
                raise ValueError('File could not be opened')
    return dict_of_file_lines
def get_mf(metadata):
    """Convert a file or object from QIIME 2 metadata to a dataframe.

    This method automatically detects the type of input metadata and
    then converts it to a pandas.DataFrame object.

    Parameters
    ----------
    metadata : str or qiime2.Metadata
        Metadata file or object.

    Returns
    -------
    pandas.DataFrame
        DataFrame object containing metadata.

    Examples
    --------
    This is a simple example.

    >>> mf = dokdo.get_mf('/Users/sbslee/Desktop/dokdo/data/moving-pictures-tutorial/sample-metadata.tsv')
    >>> mf.head()
              barcode-sequence  body-site ... reported-antibiotic-usage days-since-experiment-start
    sample-id                             ...
    L1S8          AGCTGACTAGTC        gut ...                       Yes                         0.0
    L1S57         ACACACTATGGC        gut ...                        No                        84.0
    L1S76         ACTACGTGTGGT        gut ...                        No                       112.0
    L1S105        AGTGCGATGCGT        gut ...                        No                       140.0
    L2S155        ACGATGCGACCA  left palm ...                        No                        84.0
    """
    if isinstance(metadata, str):
        mf = Metadata.load(metadata).to_dataframe()
    elif isinstance(metadata, Metadata):
        mf = metadata.to_dataframe()
    else:
        raise TypeError(f"Incorrect metadata type: {type(metadata)}")
    return mf
def setUp(self):
    super().setUp()
    # Just for reference for anyone reading this, self.plugin is set upon
    # calling super().setUp() which looks at the "package" variable set
    # above
    self.plot = self.plugin.visualizers["plot"]

    # Load the various input QZAs/etc. needed to run this test
    prefixdir = os.path.join("docs", "moving-pictures")
    self.tree = Artifact.load(os.path.join(prefixdir, "rooted-tree.qza"))
    self.table = Artifact.load(os.path.join(prefixdir, "table.qza"))
    self.md = Metadata.load(os.path.join(prefixdir, "sample_metadata.tsv"))
    # We have to transform the taxonomy QZA to Metadata ourselves
    self.taxonomy = Artifact.load(os.path.join(prefixdir, "taxonomy.qza"))
    self.fmd = self.taxonomy.view(Metadata)

    # Helps us distinguish between if the test was successful or not
    self.result = None
    # If the test was successful, we'll save the output QZV to this path
    # during tearDown().
    self.output_path = os.path.join(prefixdir, "empress-tree.qzv")
def _10(ff: ErrorCorrectionDetailsFmt) -> Metadata:
    return Metadata.load(str(ff))
    elif currentArgument in ("-n", "--nullValues"):
        user_input_file_null_values = currentValue
    elif currentArgument in ("-m", "--match"):
        user_input_file_name_match = currentValue
    elif currentArgument in ("-o", "--output"):
        outputFileName = currentValue

if outputFileName == '':
    print('output put file name not entered')
    sys.exit()
if file_of_metadata == '':
    print('metadata file not found')
    sys.exit()

#read metadata file into metadata object
try:
    originalMD = Metadata.load(file_of_metadata)
except:
    print('metadata file could not load. If you entered a valid path then '
          'try clearing the formating. The file must be a TSV metadata '
          'file.')
    print("metadata file path entered is %s" % (file_of_metadata))
    sys.exit()

#each line is a sqlite query to determine what samples to keep
exclude_query_lines_input = get_user_input_query_lines(
    user_input_file_name_exclude)
#each line is a sqlite query to determine what samples to label control
control_query_lines_input = get_user_input_query_lines(
    user_input_file_name_control)
#each line is a sqlite query to determine what samples to label case
case_query_lines_input = get_user_input_query_lines(
print("age_years columns don't match") print(csvdata["age_years"]) print(csvdata_match["age_years"]) try: assert_frame_equal(csvdata, csvdata_match) except: return False return True def test_Everything(verbose, inputdata, keep, control, case, nullvalues, match, output, csvdata_keep, csvdata_case_control, csvdata_filter, csvdata_match): csvdata_keep = Metadata.load(csvdata_keep).to_dataframe() csvdata_case_control = Metadata.load(csvdata_case_control).to_dataframe() csvdata_filter = Metadata.load(csvdata_filter).to_dataframe() csvdata_match = Metadata.load(csvdata_match).to_dataframe() tstart = time.clock() inputDict = { "inputdata": inputdata, "keep": keep, "control": control, "case": case, "nullvalues": nullvalues, "match": match } #loads and opens input files inputDict = match_controls.get_user_input_query_lines(verbose, inputDict)
def _9(ff: ErrorCorrectionDetailsFmt) -> pd.DataFrame:
    return Metadata.load(str(ff)).to_dataframe()
def cast_metadata(paths, cast, output_file, ignore_extra, error_on_missing):
    import tempfile
    from qiime2 import Metadata, metadata

    md = _merge_metadata(paths)

    cast_dict = {}
    try:
        for casting in cast:
            if ':' not in casting:
                raise click.BadParameter(
                    message=f'Missing `:` in --cast {casting}',
                    param_hint='cast')
            splitter = casting.split(':')
            if len(splitter) != 2:
                raise click.BadParameter(
                    message=f'Incorrect number of fields in --cast {casting}.'
                            f' Observed {len(splitter)}'
                            f' {tuple(splitter)}, expected 2.',
                    param_hint='cast')
            col, type_ = splitter
            if col in cast_dict:
                raise click.BadParameter(
                    message=(f'Column name "{col}" appears in cast more than'
                             ' once.'),
                    param_hint='cast')
            cast_dict[col] = type_
    except Exception as err:
        header = \
            ('Could not parse provided cast arguments into unique COLUMN:TYPE'
             ' pairs. Please make sure all cast flags are of the format --cast'
             ' COLUMN:TYPE')
        q2cli.util.exit_with_error(err, header=header)

    types = set(cast_dict.values())
    if not types.issubset(_COLUMN_TYPES):
        raise click.BadParameter(
            message=('Unknown column type provided. Please make sure all'
                     ' columns included in your cast contain a valid column'
                     ' type. Valid types: %s' % (', '.join(_COLUMN_TYPES))),
            param_hint='cast')

    column_names = set(md.columns.keys())
    cast_names = set(cast_dict.keys())

    if not ignore_extra:
        if not cast_names.issubset(column_names):
            cast = cast_names.difference(column_names)
            raise click.BadParameter(
                message=('The following cast columns were not found'
                         ' within the metadata: %s' % (', '.join(cast))),
                param_hint='cast')

    if error_on_missing:
        if not column_names.issubset(cast_names):
            cols = column_names.difference(cast_names)
            raise click.BadParameter(
                message='The following columns within the metadata'
                        ' were not provided in the cast: %s' %
                        (', '.join(cols)),
                param_hint='cast')

    # Remove entries from the cast dict that are not in the metadata to avoid
    # errors further down the road
    for cast in cast_names:
        if cast not in column_names:
            cast_dict.pop(cast)

    with tempfile.NamedTemporaryFile() as temp:
        md.save(temp.name)
        try:
            cast_md = Metadata.load(temp.name, cast_dict)
        except metadata.io.MetadataFileError as e:
            raise click.BadParameter(message=e, param_hint='cast') from e

    if output_file:
        cast_md.save(output_file)
    else:
        with tempfile.NamedTemporaryFile(mode='w+') as stdout_temp:
            cast_md.save(stdout_temp.name)
            stdout_str = stdout_temp.read()
            click.echo(stdout_str)
def load_mp_data(use_artifact_api=True, is_empire=True):
    """Loads data from the QIIME 2 moving pictures tutorial for visualization.

    It's assumed that this data is already stored in docs/moving-pictures/,
    aka the PREFIX_DIR global variable set above, which should be located
    relative to where this function is being run from. If this directory or
    the data files within it cannot be accessed, this function will
    (probably) break.

    Parameters
    ----------
    use_artifact_api: bool, optional (default True)
        If True, this will load the artifacts using the QIIME 2 Artifact API,
        and the returned objects will have types corresponding to the first
        listed types (before the | characters) shown below.
        If False, this will instead load the artifacts without using QIIME
        2's APIs; in this case, the returned objects will have types
        corresponding to the second listed types (after the | characters)
        shown below.
    is_empire: bool, optional (default True)
        If True, this will return an ordination.
        If False, will return None in place of an ordination.

    Returns
    -------
    (tree, table, md, fmd, ordination)
        tree: qiime2.Artifact | skbio.tree.TreeNode
            Phylogenetic tree.
        table: qiime2.Artifact | biom.Table
            Feature table.
        md: qiime2.Metadata | pandas.DataFrame
            Sample metadata.
        fmd: qiime2.Metadata | pandas.DataFrame
            Feature metadata. (Although this is stored in the repository as a
            FeatureData[Taxonomy] artifact, we transform it to Metadata if
            use_artifact_api is True.)
        pcoa: qiime2.Artifact | skbio.OrdinationResults | None
    """
    q2_tree_loc = os.path.join(PREFIX_DIR, "rooted-tree.qza")
    q2_table_loc = os.path.join(PREFIX_DIR, "table.qza")
    q2_pcoa_loc = os.path.join(PREFIX_DIR,
                               "unweighted_unifrac_pcoa_results.qza")
    q2_tax_loc = os.path.join(PREFIX_DIR, "taxonomy.qza")
    md_loc = os.path.join(PREFIX_DIR, "sample_metadata.tsv")
    if use_artifact_api:
        from qiime2 import Artifact, Metadata

        tree = Artifact.load(q2_tree_loc)
        table = Artifact.load(q2_table_loc)
        pcoa = Artifact.load(q2_pcoa_loc) if is_empire else None
        md = Metadata.load(md_loc)
        # We have to transform the taxonomy QZA to Metadata ourselves
        fmd = Artifact.load(q2_tax_loc).view(Metadata)
    else:
        import biom
        import pandas as pd
        from skbio.stats.ordination import OrdinationResults
        from skbio.tree import TreeNode

        with tempfile.TemporaryDirectory() as _tmp:
            tree_loc = extract_q2_artifact_to_path(_tmp, q2_tree_loc,
                                                   "tree.nwk")
            tree = TreeNode.read(tree_loc)
            tbl_loc = extract_q2_artifact_to_path(_tmp, q2_table_loc,
                                                  "feature-table.biom")
            table = biom.load_table(tbl_loc)
            if is_empire:
                pcoa_loc = extract_q2_artifact_to_path(_tmp, q2_pcoa_loc,
                                                       "ordination.txt")
                pcoa = OrdinationResults.read(pcoa_loc)
            else:
                pcoa = None
            tax_loc = extract_q2_artifact_to_path(_tmp, q2_tax_loc,
                                                  "taxonomy.tsv")
            fmd = pd.read_csv(tax_loc, sep="\t", index_col=0)
            md = pd.read_csv(md_loc, sep="\t", index_col=0, skiprows=[1])
    return tree, table, md, fmd, pcoa
# - host_ids: a list of host subject IDs in the metadata file to subset the
#   sinks to
#
# Outputs: Three modified metadata files, where each file includes just:
# - 300 "source" samples
# - All "sink" samples of a specified empo_3 category
# Furthermore, a "SourceSink" column will be added to each metadata file and
# set accordingly.

from qiime2 import Metadata
from collections import Counter

host_ids = ["host1", "host2"]

print("loading metadata...")
df = Metadata.load("smooshed-metadata.txt").to_dataframe()
agp_sample_ids = set(df.loc[df.index.str.startswith("10317.")].index)

# 1. Construct a "Source" from each of the 3 AGP empo3 vals
agp_ids_to_use = []
for e in ["Animal distal gut", "Animal secretion", "Animal surface"]:
    # save the df subsets so we only have to do this once
    empo3subset = df[df["empo_3"] == e]
    agp_from_this_empo3 = set(empo3subset.index) & agp_sample_ids
    # Sort the list of IDs then take the first 100
    agp_empo3_subset = sorted(agp_from_this_empo3)[:100]
    agp_ids_to_use += agp_empo3_subset

# TODO subset the sinks as well? We could theoretically parallelize this
# across
def run_integration_test(
    input_dir_name,
    output_dir_name,
    ranks_name,
    table_name,
    sample_metadata_name,
    feature_metadata_name=None,
    use_q2=False,
    q2_ranking_tool="songbird",
    expected_unsupported_samples=0,
    expected_unsupported_features=0,
    expect_all_unsupported_samples=False,
    q2_table_biom_format="BIOMV210Format",
    extreme_feature_count=None,
):
    """Runs qurro, and validates the output somewhat.

    Note that this is a pretty outdated function (as in, it doesn't support
    checking many of the corner cases/etc. that happen when running Qurro).
    The main purpose of this function is just checking at a high level that
    things look good, and that data is faithfully represented in the output
    main.js file.
    """
    in_dir = os.path.join("qurro", "tests", "input", input_dir_name)
    rloc = os.path.join(in_dir, ranks_name)
    tloc = os.path.join(in_dir, table_name)
    sloc = os.path.join(in_dir, sample_metadata_name)
    floc = None
    if feature_metadata_name is not None:
        floc = os.path.join(in_dir, feature_metadata_name)
    out_dir = os.path.join("docs", "demos", output_dir_name)

    rrv_qzv = result = None
    if use_q2:
        if q2_ranking_tool == "songbird":
            q2_action = q2qurro.actions.differential_plot
            q2_rank_type = "FeatureData[Differential]"
        elif q2_ranking_tool == "DEICODE":
            q2_action = q2qurro.actions.loading_plot
            q2_rank_type = "PCoAResults % Properties(['biplot'])"
        else:
            raise ValueError(
                "Unknown q2_ranking_tool: {}".format(q2_ranking_tool)
            )
        # Import all of these files as Q2 artifacts or metadata.
        rank_qza = Artifact.import_data(q2_rank_type, rloc)
        table_qza = Artifact.import_data(
            "FeatureTable[Frequency]", tloc, view_type=q2_table_biom_format
        )
        sample_metadata = Metadata.load(sloc)
        feature_metadata = None
        if floc is not None:
            feature_metadata = Metadata.load(floc)

        # Now that everything's imported, try running qurro
        rrv_qzv = q2_action(
            ranks=rank_qza,
            table=table_qza,
            sample_metadata=sample_metadata,
            feature_metadata=feature_metadata,
            extreme_feature_count=extreme_feature_count,
        )
        # Output the contents of the visualization to out_dir.
        rrv_qzv.visualization.export_data(out_dir)
    else:
        # Run qurro "standalone" -- i.e. outside of QIIME 2
        runner = CliRunner()
        args = [
            "--ranks",
            rloc,
            "--table",
            tloc,
            "--sample-metadata",
            sloc,
            "--output-dir",
            out_dir,
        ]
        if floc is not None:
            args += ["--feature-metadata", floc]
        if extreme_feature_count is not None:
            args += ["--extreme-feature-count", extreme_feature_count]
        result = runner.invoke(rrvp.plot, args)
        # Validate that the correct exit code and output were recorded
        validate_standalone_result(
            result,
            expected_unsupported_samples=expected_unsupported_samples,
            expect_all_unsupported_samples=expect_all_unsupported_samples,
            expected_unsupported_features=expected_unsupported_features,
        )
    # If we expected this test to fail due to invalid inputs, don't bother
    # doing any JSON validation.
    # (Input validity checking is done in generate.process_input(), before
    # any output files are created in generate.gen_visualization() -- so no
    # output should be created anyway in these cases.)
    if expect_all_unsupported_samples or expected_unsupported_features > 0:
        return None, None
    else:
        # Only validate JSONs if -x wasn't specified (i.e. the passed
        # extreme feature count is None)
        validate_jsons = extreme_feature_count is None
        rank_json, sample_json, count_json = validate_main_js(
            out_dir, rloc, tloc, sloc, validate_jsons=validate_jsons
        )
        return rank_json, sample_json, count_json
def prepare_lefse(table_file, taxonomy_file, metadata_file, output_file,
                  class_col, subclass_col=None, subject_col=None,
                  where=None):
    """Create a TSV file which can be used as input for the LEfSe tool.

    This command
    1) collapses the input feature table at the genus level,
    2) computes relative frequency of the features,
    3) performs sample filtration if requested,
    4) changes the format of feature names,
    5) adds the relevant metadata as 'Class', 'Subclass', and 'Subject', and
    6) writes a text file which can be used as input for LEfSe.

    Parameters
    ----------
    table_file : str
        Path to the table file with the 'FeatureTable[Frequency]' type.
    taxonomy_file : str
        Path to the taxonomy file with the 'FeatureData[Taxonomy]' type.
    metadata_file : str
        Path to the metadata file.
    output_file : str
        Path to the output file.
    class_col : str
        Metadata column used as 'Class' by LEfSe.
    subclass_col : str, optional
        Metadata column used as 'Subclass' by LEfSe.
    subject_col : str, optional
        Metadata column used as 'Subject' by LEfSe.
    where : str, optional
        SQLite 'WHERE' clause specifying sample metadata criteria.
    """
    _ = taxa.methods.collapse(table=Artifact.load(table_file),
                              taxonomy=Artifact.load(taxonomy_file),
                              level=6)

    _ = feature_table.methods.relative_frequency(table=_.collapsed_table)

    if where is None:
        df = _.relative_frequency_table.view(pd.DataFrame)
    else:
        _ = feature_table.methods.filter_samples(
            table=_.relative_frequency_table,
            metadata=Metadata.load(metadata_file),
            where=where)
        df = _.filtered_table.view(pd.DataFrame)

    def f(x):
        for c in ['-', '[', ']', '(', ')', ' ']:
            x = x.replace(c, '_')
        ranks = x.split(';')
        base = ranks[0]
        result = [base]
        for i, rank in enumerate(ranks[1:], start=2):
            if rank == '__':
                result.append(f'{base}_x__L{i}')
            elif rank.split('__')[1] == '':
                result.append(f'{base}_{rank}L{i}')
            else:
                result.append(rank)
                base = rank
        return '|'.join(result)

    df.columns = [f(x) for x in df.columns.to_list()]

    mf = dokdo.get_mf(metadata_file)
    mf = mf.replace(' ', '_', regex=True)

    cols = mf.columns.to_list()
    df = pd.concat([df, mf], axis=1, join="inner")
    df.insert(0, class_col, df.pop(class_col))
    cols.remove(class_col)

    if subclass_col is None and subject_col is None:
        pass
    elif subclass_col is not None and subject_col is None:
        df.insert(1, subclass_col, df.pop(subclass_col))
        cols.remove(subclass_col)
    elif subclass_col is None and subject_col is not None:
        df.insert(1, subject_col, df.pop(subject_col))
        cols.remove(subject_col)
    else:
        df.insert(1, subclass_col, df.pop(subclass_col))
        df.insert(2, subject_col, df.pop(subject_col))
        cols.remove(subclass_col)
        cols.remove(subject_col)

    df.drop(columns=cols, inplace=True)
    df.T.to_csv(output_file, header=False, sep='\t')