Example #1
def test_determine_cases_and_controls(unit, normal_input, normal_output,
                                      normal_control, normal_case,
                                      noentries_case, empty_file):
    '''
    Tests the program's labeling of case and control samples

    Parameters
    ----------
    unit: string
        Location of the folder that holds the unit test files
    normal_input: string
        Name of the tsv file that will be loaded into a metadata object. This
            object will be used to test the case and control labeling
            functionality of match_functions.py
    normal_output: string
        Name of the tsv file that will be loaded into a metadata object. This
            object will be used to test that valid case and control inputs get
            the correct output
    normal_control: string
        Name of the query file that contains the normal control query using IN
    normal_case: string
        Name of the query file that contains the normal case query
    noentries_case: string
        Name of the query file that results in no cases being found. This tests
            that filtering everything out raises an error.
    empty_file: string
        Name of the empty file used in various tests
    '''
    extra = False
    norm_in = Metadata.load("./%s/%s" % (unit, normal_input))
    norm_out = Metadata.load("./%s/%s" % (unit, normal_output))

    norm_case = open("./%s/%s" % (unit, normal_case), "r").read().splitlines()
    norm_control = open("./%s/%s" % (unit, normal_control),
                        "r").read().splitlines()
    noentry_case = open("./%s/%s" % (unit, noentries_case),
                        "r").read().splitlines()
    emp_file = open("./%s/%s" % (unit, empty_file), "r").read().splitlines()

    case_control_dict = {"case": norm_case, "control": norm_control}
    unit_norm_out = match_functions.determine_cases_and_controls(
        norm_in, case_control_dict, extra)

    case_control_dict = {"case": emp_file, "control": emp_file}
    assert_raises(ValueError, match_functions.determine_cases_and_controls,
                  norm_in, case_control_dict, extra)

    case_control_dict = {"case": noentry_case, "control": norm_control}
    assert_raises(ValueError, match_functions.determine_cases_and_controls,
                  norm_in, case_control_dict, extra)

    norm_out = norm_out.to_dataframe()
    unit_norm_out = unit_norm_out.to_dataframe()
    assert_frame_equal(norm_out, unit_norm_out)
Example #2
def test_keep_samples(unit, normal_input, normal_output, normal_keep,
                      noentries_keep, empty_file):
    '''
    Tests the program's filtering out of unwanted samples based on SQL queries

    Parameters
    ----------
    unit: string
        Location of the folder that holds the unit test files
    normal_input: string
        Name of the tsv file that will be loaded into a metadata object. This
            object will be used to test the keeping of samples functionality
            of match_functions.py
    normal_output: string
        Name of the tsv file that will be loaded into a metadata object. This
            object will be used to test that valid inputs for keeping samples
            get the correct output
    normal_keep: string
        Name of the query file that contains the sql queries to do a normal
            keep
    noentries_keep: string
        Name of the query file that contains the SQL queries that keep no
            samples
    empty_file: string
        Name of the empty file used in various tests
    '''
    extra = False
    norm_in = Metadata.load("./%s/%s" % (unit, normal_input))
    norm_out = Metadata.load("./%s/%s" % (unit, normal_output))

    norm_keep = open("./%s/%s" % (unit, normal_keep), "r").read().splitlines()
    noentry_keep = open("./%s/%s" % (unit, noentries_keep),
                        "r").read().splitlines()
    emp_file = open("./%s/%s" % (unit, empty_file), "r").read().splitlines()

    unit_norm_out = match_functions.keep_samples(norm_in, norm_keep, extra)

    assert_raises(ValueError, match_functions.keep_samples, norm_in, emp_file,
                  extra)
    assert_raises(ValueError, match_functions.keep_samples, norm_in,
                  noentry_keep, extra)

    unit_norm_out = unit_norm_out.to_dataframe()
    norm_out = norm_out.to_dataframe()
    assert_frame_equal(norm_out, unit_norm_out)
Example #3
    def setUp(self):
        super().setUp()
        self.preprocess = self.plugin.pipelines['preprocess']

        continuous_metadata = pd.DataFrame(
            {
                'target': ['1.0', '2.0', '3.0', '4.0'],
                'contain_nan': ['3.3', '3.5', None, '3.9']
            },
            index=pd.Index(['A', 'B', 'C', 'D'], name='id'))
        self.continuous_metadata = continuous_metadata

        discrete_metadata = pd.DataFrame(
            {
                'target': ['0', '1', '0', '1'],
                'target_int': [1, 0, 1, 0],
                'contain_nan': ['0', '1', None, '1'],
                'non_encoded': ['10', '2', '', 'b']
            },
            index=pd.Index(['A', 'B', 'C', 'D'], name='id'))
        self.discrete_metadata = discrete_metadata

        TEST_DIR = path.split(__file__)[0]
        md_path = path.join(TEST_DIR, 'data/sample-metadata-binary.tsv')
        table_path = path.join(TEST_DIR, 'data/table.qza')
        rooted_tree_path = path.join(TEST_DIR, 'data/rooted-tree.qza')
        unrooted_tree_path = path.join(TEST_DIR, 'data/unrooted-tree.qza')

        self.mp_sample_metadata = Metadata.load(md_path)
        self.mp_table = Artifact.load(table_path)
        self.mp_rooted_tree = Artifact.load(rooted_tree_path)
        self.mp_unrooted_tree = Artifact.load(unrooted_tree_path)
Example #4
    def test_primitive_passed_incorrectly(self):
        concatenate_ints = self.plugin.methods['concatenate_ints']
        identity_with_metadata = self.plugin.methods['identity_with_metadata']
        params_only_method = self.plugin.methods['params_only_method']

        md_fp = get_data_path('valid/simple.tsv')
        inappropriate_metadata = Metadata.load(md_fp)

        ints1 = Artifact.import_data(IntSequence1, [0, 42, 43])
        ints3 = Artifact.import_data(IntSequence1, [12, 111])
        int1 = 4
        int2 = 5
        arbitrary_int = 43

        # tests primitive int passed as IntSequence artifact
        with self.assertRaisesRegex(TypeError,
                                    'ints2.*43.*incompatible.*IntSequence1'):
            concatenate_ints(ints1, arbitrary_int, ints3, int1, int2)

        # tests primitive passed as metadata
        with self.assertRaisesRegex(TypeError,
                                    'metadata.*43.*incompatible.*Metadata'):
            identity_with_metadata(ints1, arbitrary_int)

        # tests wrong type of primitive passed
        with self.assertRaisesRegex(TypeError,
                                    'age.*arbitraryString.*incompatible.*Int'):
            params_only_method('key string', 'arbitraryString')

        # tests metadata passed as artifact
        with self.assertRaisesRegex(TypeError,
                                    '\'ints2\'.*Metadata.*IntSequence1'):
            concatenate_ints(ints1, inappropriate_metadata, ints3, int1, int2)
Example #5
File: cli.py Project: KTbiotech/dokdo
def merge_metadata(metadata, output):
    dfs = []

    for file in metadata:
        dfs.append(Metadata.load(file).to_dataframe())

    Metadata(pd.concat(dfs)).save(output)
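
A minimal usage sketch for the command above; the TSV paths below are hypothetical and must be valid QIIME 2 metadata files.

# Hypothetical invocation of merge_metadata() defined above; it stacks the
# rows of every input metadata TSV and writes the merged table to `output`.
merge_metadata(metadata=["run1-metadata.tsv", "run2-metadata.tsv"],
               output="merged-metadata.tsv")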
Example #6
def get_itol_barchart(fdata: pd.DataFrame, table_file: str, metadata_file: str,
                      metadata_column: str, output_file: str):
    '''Generate a table in QIIME 2 artifact format which can be directly
    parsed by iTOL and yield a multi-bar chart.
    '''
    # load sample feature table
    table = Artifact.load(table_file)

    # extract BIOM table
    table = table.view(biom.Table)

    # load sample metadata
    meta = Metadata.load(metadata_file)

    # generate a sample Id to category map
    column = meta.get_column(metadata_column).drop_missing_values()
    catmap = column.to_series().to_dict()

    # collapse feature table by category
    # note: when multiple samples map to one category, take **mean**
    table = table.collapse(lambda i, _: catmap[i], norm=True, axis='sample')

    # import BIOM table into QIIME 2 and save
    res = Artifact.import_data('FeatureTable[Frequency]', table)
    res.save(output_file)
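
A brief usage sketch; the paths and the column name are hypothetical, and since the body shown above never uses the fdata argument, an empty DataFrame is passed for it.

import pandas as pd

# Hypothetical inputs: a FeatureTable[Frequency] artifact, a metadata TSV,
# and a categorical column whose values become the iTOL bar-chart groups.
get_itol_barchart(fdata=pd.DataFrame(),
                  table_file="table.qza",
                  metadata_file="sample-metadata.tsv",
                  metadata_column="body-site",
                  output_file="itol-barchart.qza")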
Example #7
File: cli.py Project: KTbiotech/dokdo
def add_metadata(metadata, columns, output):
    mf1 = Metadata.load(metadata).to_dataframe()
    index_name = mf1.index.name
    dtypes = mf1.dtypes.to_dict()
    mf2 = pd.read_table(columns, keep_default_na=False)

    for k, v in dtypes.items():
        if k in mf2.columns:
            if v == 'object':
                mf2[k] = mf2[k].astype(str)
            else:
                mf2[k] = mf2[k].astype(v)

    mf3 = mf1.reset_index().merge(mf2).set_index(index_name)

    a = mf1.shape[0]
    b = mf3.shape[0]

    if a != b:
        message = (f"Final metadata (N={b}) has different number of samples "
                   f"than input metadata (N={a}). Please double check "
                   "whether this was intended.")
        warnings.warn(message)

    Metadata(mf3).save(output)
Example #8
def load_mp_data():
    """Loads data from the QIIME 2 moving pictures tutorial for visualization.

    It's assumed that this data is already stored in docs/moving-pictures/, aka
    the PREFIX_DIR global variable set above, which should be located relative
    to where this function is being run from. If this directory or the data
    files within it cannot be accessed, this function will (probably) break.

    Returns
    -------
    (tree, table, md, fmd, pcoa)
        tree: Artifact with semantic type Phylogeny[Rooted]
            Phylogenetic tree.
        table: Artifact with semantic type FeatureTable[Frequency]
            Feature table.
        md: Metadata
            Sample metadata.
        fmd: Metadata
            Feature metadata. (Although this is stored in the repository as a
            FeatureData[Taxonomy] artifact, we transform it to Metadata.)
        pcoa: Artifact with semantic type PCoAResults
            Ordination.
    """
    tree = Artifact.load(os.path.join(PREFIX_DIR, "rooted-tree.qza"))
    table = Artifact.load(os.path.join(PREFIX_DIR, "table.qza"))
    pcoa = Artifact.load(
        os.path.join(PREFIX_DIR, "unweighted_unifrac_pcoa_results.qza")
    )
    md = Metadata.load(os.path.join(PREFIX_DIR, "sample_metadata.tsv"))
    # We have to transform the taxonomy QZA to Metadata ourselves
    taxonomy = Artifact.load(os.path.join(PREFIX_DIR, "taxonomy.qza"))
    fmd = taxonomy.view(Metadata)
    return tree, table, md, fmd, pcoa
Example #9
def _load_q2_metadata(metadata_path, name):
    try:
        new_resource = Metadata.load(metadata_path)
    except TypeError:
        # if metadata_path is some type that does not support '+' with str,
        #  e.g., dict, then q2 metadata will raise a TypeError. Catch this
        #  error and raise a MetadataFileError, which is more informative
        raise MetadataFileError(str(metadata_path))
    return new_resource.to_dataframe()
Example #10
def setup(feature_table, sample_metadata):
    try:
        Artifact.load(feature_table)
    except Exception as e:
        raise ValueError(e)

    try:
        Metadata.load(sample_metadata)
    except Exception as e:
        raise ValueError(e)

    os.mkdir("pbs_out")

    with fileinput.input("config/config.yaml", inplace=True) as f:
        for line in f:
            if f.filelineno() == 1:
                print(f"feature_table: {feature_table}")
            elif f.filelineno() == 2:
                print(f"sample_metadata: {sample_metadata}")
            else:
                print(line, end="")
Example #11
    def setUp(self):
        super().setUp()

        data_single = SingleLanePerSampleSingleEndFastqDirFmt(
            self.get_data_path('filter_samples_single_end/dir_fmt'), mode='r')
        self.sample_single = _PlotQualView(data_single, False)
        self.manifest_single = data_single.manifest.view(pd.DataFrame)

        self.md_single_all = Metadata.load(
            self.get_data_path('filter_samples_single_end/filter_all.tsv'))
        self.md_single_subset = Metadata.load(
            self.get_data_path('filter_samples_single_end/filter_subset.tsv'))
        self.md_single_none = Metadata.load(
            self.get_data_path('filter_samples_single_end/filter_none.tsv'))

        data_paired = SingleLanePerSamplePairedEndFastqDirFmt(
            self.get_data_path('filter_samples_paired_end/dir_fmt'), mode='r')
        self.sample_paired = _PlotQualView(data_paired, True)
        self.manifest_paired = data_paired.manifest.view(pd.DataFrame)

        self.md_paired_all = Metadata.load(
            self.get_data_path('filter_samples_single_end/filter_all.tsv'))
        self.md_paired_subset = Metadata.load(
            self.get_data_path('filter_samples_single_end/filter_subset.tsv'))
        self.md_paired_none = Metadata.load(
            self.get_data_path('filter_samples_single_end/filter_none.tsv'))
Example #12
def do_demux_art(input_fp, metadata, metadata_bc_col, rev_bc, rev_map_bc, output_fp):
    """Imports data and runs demux on it

    Parameters
    ----------
    input_fp: str
        Path to folder that contains sequences and barcodes. MUST be
        named sequences.fastq.gz and barcodes.fastq.gz
    metadata: str
        Path to metadata file that contains barcode sequences. See
        "sample-metadata.tsv" in mockrobiota datasets for examples.
    metadata_bc_col: str
        Name of column in metadata file that holds the barcode sequences
    rev_bc: bool
        Whether to reverse barcodes
    rev_map_bc: bool
        Whether to reverse mapping barcodes
    output_fp: str, optional or None
        Path to where we output demuxed qza file. Does not save if None

    Returns
    -------
    Artifact
        demuxed sequences
    Metadata
        Associated metadata
    """
    start = time.clock()
    if (input_fp is None or metadata is None or metadata_bc_col is None):
        click.echo("Run \'rdemux --help\' flag to see correct usage")
        return

    click.echo("Importing seq data from " + input_fp)
    art = Artifact.import_data("EMPSingleEndSequences", input_fp)

    click.echo("Loading metadata from " + metadata)
    barcode_metadata = Metadata.load(metadata)

    click.echo("Demuxing")
    demux, = emp_single(art,
                        barcode_metadata.get_column(metadata_bc_col),
                        rev_comp_barcodes=rev_bc,
                        rev_comp_mapping_barcodes=rev_map_bc)

    if output_fp is None:
        click.echo("Not saving demux output")
    else:
        demux.save(output_fp)

    click.echo("{}s for do_demux".format(str(time.clock() - start)))
    return demux, barcode_metadata
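
A short usage sketch; the directory, metadata file, and column name are hypothetical placeholders that must follow the naming rules in the docstring above.

# Hypothetical invocation: the input folder must contain sequences.fastq.gz
# and barcodes.fastq.gz, and the metadata TSV must hold the barcode column.
demux, barcode_md = do_demux_art(input_fp="emp-single-end-sequences",
                                 metadata="sample-metadata.tsv",
                                 metadata_bc_col="barcode-sequence",
                                 rev_bc=False,
                                 rev_map_bc=False,
                                 output_fp="demux.qza")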
Example #13
def get_user_input_query_lines(dictofFiles):
    '''
    Uses a dictionary of file paths/names (dictofFiles) to create a new
        dictionary (dict_of_file_lines) of arrays that represent the lines of
        each file from the original input dictionary (dictofFiles)

    Parameters
    ----------
    dictofFiles: dictionary of strings
        Each key is what the string is for, like metadata being the input
            metadata file. The value for each key is a file path/name

    Returns
    -------
    dict_of_file_lines: dictionary of arrays of strings
        The dictionary has the keys that have some value in dictofFiles. The
            values are arrays of the lines of the file each key corresponds
            to.

    Raises
    ------
    ValueError
        If a metadata file can't be loaded into a metadata object
        If a file can't be opened and its lines read into the
            dictionary of input files

    '''
    dict_of_file_lines = {}
    for key in dictofFiles:
        if dictofFiles[key] is None:
            continue
        if key == "metadata":
            if isinstance(dictofFiles[key], str):
                #read metadata file into metadata object
                print("metadata file path entered is %s" % (dictofFiles[key]))
                dict_of_file_lines[key] = Metadata.load(dictofFiles[key])
            else:
                dict_of_file_lines[key] = dictofFiles[key]

        else:
            print("file path entered is %s" % (dictofFiles[key]))
            try:
                dict_of_file_lines[key] = open("./%s" % (dictofFiles[key]),
                                               "r").read().splitlines()
            except:
                raise ValueError("File could not be opened")

    return dict_of_file_lines
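
A small usage sketch; the keys mirror the roles the function checks for, and the file names are hypothetical.

# Hypothetical input dictionary: "metadata" is loaded as a qiime2 Metadata
# object, other keys are read as lists of query lines, None values are skipped.
files = {
    "metadata": "sample-metadata.tsv",
    "case": "case_query.txt",
    "control": None,
}
query_lines = get_user_input_query_lines(files)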
Example #14
def load_env_metadata(env_metadata_path):
    # Use QIIME2 Metadata API to load metadata
    env_metadata_obj = Metadata.load(env_metadata_path)
    env_metadata_df = env_metadata_obj.to_dataframe()

    # Rename index
    env_metadata_df.index.names = ['SampleID']
    # environmental metadata columns MUST be numeric type
    # Drop all non-numeric columns
    numeric_env_df = env_metadata_df.select_dtypes(include='number')

    if len(numeric_env_df.columns) == 0:
        raise AXIOME3Error(
            "Environmental metadata must contain at least one numeric column!")

    return numeric_env_df
Example #15
def load_metadata(metadata_path):
    # Use QIIME2 Metadata API to load metadata
    metadata_obj = Metadata.load(metadata_path)
    metadata_df = metadata_obj.to_dataframe()

    # Rename index
    metadata_df.index.names = ['SampleID']
    # By default, pandas treats string as object
    # Convert object dtype to category
    cols = metadata_df.columns
    object_type_cols = cols[metadata_df.dtypes == object]

    for col in object_type_cols:
        convert_col_dtype(metadata_df, col, "category")

    return metadata_df
Example #16
def add_metadata(metadata_file, columns_file, output_file):
    """Add new metadata columns to an existing metadata file (.tsv).

    The files '-i/--metadata-file' and '-c/--columns-file' must have at
    least one overlapping column.

    Parameters
    ----------
    metadata_file
        Path to the metadata file.
    columns_file
        Path to a text file (.tsv) containing the columns to be added.
        The first row should be column names.
    output_file
        Path to the output file.
    """
    mf1 = Metadata.load(metadata_file).to_dataframe()
    index_name = mf1.index.name
    dtypes = mf1.dtypes.to_dict()
    mf2 = pd.read_table(columns_file, keep_default_na=False)

    for k, v in dtypes.items():
        if k in mf2.columns:
            if v == 'object':
                mf2[k] = mf2[k].astype(str)
            else:
                mf2[k] = mf2[k].astype(v)

    mf3 = mf1.reset_index().merge(mf2).set_index(index_name)
    mf3 = mf3.reindex(mf1.index)

    a = mf1.shape[0]
    b = mf3.shape[0]

    if a != b:
        message = (f"Final metadata (N={b}) has different number of samples "
                   f"than input metadata (N={a}). Please double check "
                   "whether this was intended.")
        warnings.warn(message)

    if mf3.isnull().values.any():
        warnings.warn("Final metadata contains NaN. Please double check "
                      "whether this was intended.")

    Metadata(mf3).save(output_file)
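
A minimal usage sketch with hypothetical file names; columns.tsv must share at least one column (typically the sample ID column) with metadata.tsv so the merge can align rows.

# Hypothetical invocation of add_metadata() defined above.
add_metadata(metadata_file="metadata.tsv",
             columns_file="columns.tsv",
             output_file="metadata-extended.tsv")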
Example #17
def manipulate_md(input_metadata_file, param_list, output_metadata_file,
                  modification_func):
    """Automates a common I/O paradigm in Qeeseburger's scripts.

       Loads a metadata file as a pandas DataFrame, calls modification_func on
       the DF with some specified parameters (can be an empty list if there are
       no other parameters besides the metadata file), and outputs the modified
       metadata DF to an output path.
    """
    # First off, load the metadata file and convert it to a DataFrame
    m = Metadata.load(input_metadata_file)
    m_df = m.to_dataframe()

    # ... Actually do relevant computations
    m_df_new = modification_func(m_df, *param_list)

    # Convert modified DataFrame back into a q2 Metadata object and save it
    Metadata(m_df_new).save(output_metadata_file)
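
A sketch of the calling pattern with a hypothetical modification function: modification_func receives the metadata DataFrame followed by the unpacked param_list and must return the modified DataFrame.

# Hypothetical modification function: adds a constant-valued column.
def add_constant_column(df, column_name, value):
    df = df.copy()
    df[column_name] = value
    return df

# Hypothetical file names; adds a "cohort" column set to "A" for every sample.
manipulate_md("metadata.tsv", ["cohort", "A"], "metadata-with-cohort.tsv",
              add_constant_column)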
Example #18
def import_dataset(working_dir_fp, metadata_barcode_column,
                   rev_comp_barcodes_in=False,
                   rev_comp_mapping_barcodes_in=False):
    """Imports seqs as qiime artifact, demuxes them.
    Requires that fastq.gz files already be in
    working_dir_fp/emp-single-end-seqs and sample-metadata.tsv be in
    working_dir_fp

    Parameters
    ----------
    working_dir_fp: str
        filepath where sequences_url + barcodes_url file are
        downloaded to and put into a directory "emp-single-end-sequences".
        Should also contain sample-metadata.tsv. Ideally, this should be a
        mock-<n> directory from when you clone the mockrobiota github repo.
        Should not end with "/".
    metadata_barcode_column: str
        column header in sample-metadata.tsv that holds barcode data
    rev_comp_barcodes_in: bool
        param to emp_single for reversing barcode seqs
    rev_comp_mapping_barcodes_in: bool
        param to emp_single for reversing barcode seqs in metadata

    Returns
    -------
    demuxed seqs,
    loaded metadata
    OR
    None if fails
    """
    print("Importing seq data")
    seqs = Artifact.import_data("EMPSingleEndSequences", working_dir_fp +
                                "/emp-single-end-sequences")

    print("Loading metadata")
    barcode_metadata = Metadata.load(working_dir_fp + "/sample-metadata.tsv")

    print("Demuxing")
    demux, = emp_single(
        seqs,
        barcode_metadata.get_column(metadata_barcode_column),
        rev_comp_barcodes=rev_comp_barcodes_in,
        rev_comp_mapping_barcodes=rev_comp_mapping_barcodes_in)
    return demux, barcode_metadata
Example #19
def get_user_input_query_lines(verbose, dictofFiles):
    '''
    Uses a dictionary of file paths/names (dictofFiles) to create a new
        dictionary (dict_of_file_lines) of arrays that represent the lines of
        each file from the original input dictionary (dictofFiles)

    Parameters
    ----------
    verbose: boolean
        Tells function if it should output print statements or not.
            True outputs print statements.
    dictofFiles: dictionary of strings
        Each key is what the string is for, like inputdata being the input
            metadata file. The value for each key is a file path/name

    Returns
    -------
    dict_of_file_lines: dictionary of arrays of strings
        The dictionary has the keys that have some value in dictofFiles. The
            values are arrays of the lines of the file each key corresponds
            to.
    '''
    #dictOfReturnValues = {"inputdata":None, "keep":None, "control":None, "case":None, "nullvalues":None, "match":None}
    dict_of_file_lines = {}
    for key in dictofFiles:
        if dictofFiles[key] is None:
            continue
        if key == "inputdata":
            #read metadata file into metadata object
            if verbose:
                print("metadata file path entered is %s"%(dictofFiles[key]))
            try:
                dict_of_file_lines[key] = Metadata.load(dictofFiles[key])
            except:
                raise ValueError('metadata file could not load. The file must be a TSV metadata file.')
        else:
            if verbose:
                print("file path entered is %s" % (dictofFiles[key]))
            try:
                dict_of_file_lines[key] = open('./%s'%(dictofFiles[key]),'r').readlines()
            except:
                raise ValueError('File could not be opened') 
    return dict_of_file_lines
Example #20
def get_mf(metadata):
    """Convert a file or object from QIIME 2 metadata to a dataframe.

    This method automatically detects the type of input metadata and
    then converts it to a pandas.DataFrame object.

    Parameters
    ----------
    metadata : str or qiime2.Metadata
        Metadata file or object.
 
    Returns
    -------
    pandas.DataFrame
        DataFrame object containing metadata.

    Examples
    --------
    This is a simple example.

    >>> mf = dokdo.get_mf('/Users/sbslee/Desktop/dokdo/data/moving-pictures-tutorial/sample-metadata.tsv')
    >>> mf.head()
              barcode-sequence  body-site  ...  reported-antibiotic-usage  days-since-experiment-start
    sample-id                              ...
    L1S8          AGCTGACTAGTC        gut  ...                        Yes                          0.0
    L1S57         ACACACTATGGC        gut  ...                         No                         84.0
    L1S76         ACTACGTGTGGT        gut  ...                         No                        112.0
    L1S105        AGTGCGATGCGT        gut  ...                         No                        140.0
    L2S155        ACGATGCGACCA  left palm  ...                         No                         84.0
    """
    if isinstance(metadata, str):
        mf = Metadata.load(metadata).to_dataframe()
    elif isinstance(metadata, Metadata):
        mf = metadata.to_dataframe()
    else:
        raise TypeError(f"Incorrect metadata type: {type(metadata)}")
    return mf
Example #21
    def setUp(self):
        super().setUp()

        # Just for reference for anyone reading this, self.plugin is set upon
        # calling super().setUp() which looks at the "package" variable set
        # above
        self.plot = self.plugin.visualizers["plot"]

        # Load the various input QZAs/etc. needed to run this test
        prefixdir = os.path.join("docs", "moving-pictures")
        self.tree = Artifact.load(os.path.join(prefixdir, "rooted-tree.qza"))
        self.table = Artifact.load(os.path.join(prefixdir, "table.qza"))
        self.md = Metadata.load(os.path.join(prefixdir, "sample_metadata.tsv"))

        # We have to transform the taxonomy QZA to Metadata ourselves
        self.taxonomy = Artifact.load(os.path.join(prefixdir, "taxonomy.qza"))
        self.fmd = self.taxonomy.view(Metadata)

        # Helps us distinguish between if the test was successful or not
        self.result = None

        # If the test was successful, we'll save the output QZV to this path
        # during tearDown().
        self.output_path = os.path.join(prefixdir, "empress-tree.qzv")
Example #22
def _10(ff: ErrorCorrectionDetailsFmt) -> Metadata:
    return Metadata.load(str(ff))
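
For context, transformers like this are normally registered on a plugin's Plugin object via a decorator; below is a minimal sketch of that pattern (the plugin instance and the format class are assumptions based on the usual QIIME 2 plugin layout, not code from this project).

from qiime2 import Metadata

# Sketch only: `plugin` is assumed to be the qiime2.plugin.Plugin instance
# from the package's plugin_setup module, and ErrorCorrectionDetailsFmt an
# already-defined file format; the framework selects the transformer from
# the type annotations.
@plugin.register_transformer
def _error_correction_details_to_metadata(
        ff: ErrorCorrectionDetailsFmt) -> Metadata:
    return Metadata.load(str(ff))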
    elif currentArgument in ("-n", "--nullValues"):
        user_input_file_null_values = currentValue
    elif currentArgument in ("-m", "--match"):
        user_input_file_name_match = currentValue
    elif currentArgument in ("-o", "--output"):
        outputFileName = currentValue

if outputFileName == '':
    print('output file name not entered')
    sys.exit()
if file_of_metadata == '':
    print('metadata file not found')
    sys.exit()
#read metadata file into metadata object
try:
    originalMD = Metadata.load(file_of_metadata)
except:
    print(
        'metadata file could not load. If you entered a valid path then try clearing the formatting. The file must be a TSV metadata file.'
    )
    print("metadata file path entered is %s" % (file_of_metadata))
    sys.exit()

#each line is a sqlite query to determine what samples to keep
exclude_query_lines_input = get_user_input_query_lines(
    user_input_file_name_exclude)
#each line is a sqlite query to determine what samples to label control
control_query_lines_input = get_user_input_query_lines(
    user_input_file_name_control)
#each line is a sqlite query to determine what samples to label case
case_query_lines_input = get_user_input_query_lines(
        print("age_years columns don't match")
        print(csvdata["age_years"])
        print(csvdata_match["age_years"])
    try:
        assert_frame_equal(csvdata, csvdata_match)

    except:
        return False
    return True


def test_Everything(verbose, inputdata, keep, control, case, nullvalues, match,
                    output, csvdata_keep, csvdata_case_control, csvdata_filter,
                    csvdata_match):

    csvdata_keep = Metadata.load(csvdata_keep).to_dataframe()
    csvdata_case_control = Metadata.load(csvdata_case_control).to_dataframe()
    csvdata_filter = Metadata.load(csvdata_filter).to_dataframe()
    csvdata_match = Metadata.load(csvdata_match).to_dataframe()

    tstart = time.clock()
    inputDict = {
        "inputdata": inputdata,
        "keep": keep,
        "control": control,
        "case": case,
        "nullvalues": nullvalues,
        "match": match
    }
    #loads and opens input files
    inputDict = match_controls.get_user_input_query_lines(verbose, inputDict)
Example #25
def _9(ff: ErrorCorrectionDetailsFmt) -> pd.DataFrame:
    return Metadata.load(str(ff)).to_dataframe()
Example #26
File: tools.py Project: qiime2/q2cli
def cast_metadata(paths, cast, output_file, ignore_extra,
                  error_on_missing):
    import tempfile
    from qiime2 import Metadata, metadata

    md = _merge_metadata(paths)

    cast_dict = {}
    try:
        for casting in cast:
            if ':' not in casting:
                raise click.BadParameter(
                    message=f'Missing `:` in --cast {casting}',
                    param_hint='cast')
            splitter = casting.split(':')
            if len(splitter) != 2:
                raise click.BadParameter(
                    message=f'Incorrect number of fields in --cast {casting}.'
                            f' Observed {len(splitter)}'
                            f' {tuple(splitter)}, expected 2.',
                    param_hint='cast')
            col, type_ = splitter
            if col in cast_dict:
                raise click.BadParameter(
                    message=(f'Column name "{col}" appears in cast more than'
                             ' once.'),
                    param_hint='cast')
            cast_dict[col] = type_
    except Exception as err:
        header = \
            ('Could not parse provided cast arguments into unique COLUMN:TYPE'
             ' pairs. Please make sure all cast flags are of the format --cast'
             ' COLUMN:TYPE')
        q2cli.util.exit_with_error(err, header=header)

    types = set(cast_dict.values())
    if not types.issubset(_COLUMN_TYPES):
        raise click.BadParameter(
            message=('Unknown column type provided. Please make sure all'
                     ' columns included in your cast contain a valid column'
                     ' type. Valid types: %s' %
                     (', '.join(_COLUMN_TYPES))),
            param_hint='cast')

    column_names = set(md.columns.keys())
    cast_names = set(cast_dict.keys())

    if not ignore_extra:
        if not cast_names.issubset(column_names):
            cast = cast_names.difference(column_names)
            raise click.BadParameter(
                message=('The following cast columns were not found'
                         ' within the metadata: %s' %
                         (', '.join(cast))),
                param_hint='cast')

    if error_on_missing:
        if not column_names.issubset(cast_names):
            cols = column_names.difference(cast_names)
            raise click.BadParameter(
                message='The following columns within the metadata'
                        ' were not provided in the cast: %s' %
                        (', '.join(cols)),
                param_hint='cast')

    # Remove entries from the cast dict that are not in the metadata to avoid
    # errors further down the road
    for cast in cast_names:
        if cast not in column_names:
            cast_dict.pop(cast)

    with tempfile.NamedTemporaryFile() as temp:
        md.save(temp.name)
        try:
            cast_md = Metadata.load(temp.name, cast_dict)
        except metadata.io.MetadataFileError as e:
            raise click.BadParameter(message=e, param_hint='cast') from e

    if output_file:
        cast_md.save(output_file)
    else:
        with tempfile.NamedTemporaryFile(mode='w+') as stdout_temp:
            cast_md.save(stdout_temp.name)
            stdout_str = stdout_temp.read()
            click.echo(stdout_str)
Example #27
def load_mp_data(use_artifact_api=True, is_empire=True):
    """Loads data from the QIIME 2 moving pictures tutorial for visualization.

    It's assumed that this data is already stored in docs/moving-pictures/, aka
    the PREFIX_DIR global variable set above, which should be located relative
    to where this function is being run from. If this directory or the data
    files within it cannot be accessed, this function will (probably) break.

    Parameters
    ----------
    use_artifact_api: bool, optional (default True)
        If True, this will load the artifacts using the QIIME 2 Artifact API,
        and the returned objects will have types corresponding to the first
        listed types (before the | characters) shown below.
        If False, this will instead load the artifacts without using QIIME 2's
        APIs; in this case, the returned objects will have types corresponding
        to the second listed types (after the | characters) shown below.
    is_empire: bool, optional (default True)
        If True, this will return an ordination.
        If False, will return None in place of an ordination.

    Returns
    -------
    (tree, table, md, fmd, pcoa)
        tree: qiime2.Artifact | skbio.tree.TreeNode
            Phylogenetic tree.
        table: qiime2.Artifact | biom.Table
            Feature table.
        md: qiime2.Metadata | pandas.DataFrame
            Sample metadata.
        fmd: qiime2.Metadata | pandas.DataFrame
            Feature metadata. (Although this is stored in the repository as a
            FeatureData[Taxonomy] artifact, we transform it to Metadata if
            use_artifact_api is True.)
        pcoa: qiime2.Artifact | skbio.OrdinationResults | None
            Ordination, or None if is_empire is False.
    """
    q2_tree_loc = os.path.join(PREFIX_DIR, "rooted-tree.qza")
    q2_table_loc = os.path.join(PREFIX_DIR, "table.qza")
    q2_pcoa_loc = os.path.join(PREFIX_DIR,
                               "unweighted_unifrac_pcoa_results.qza")
    q2_tax_loc = os.path.join(PREFIX_DIR, "taxonomy.qza")
    md_loc = os.path.join(PREFIX_DIR, "sample_metadata.tsv")
    if use_artifact_api:
        from qiime2 import Artifact, Metadata

        tree = Artifact.load(q2_tree_loc)
        table = Artifact.load(q2_table_loc)
        pcoa = Artifact.load(q2_pcoa_loc) if is_empire else None
        md = Metadata.load(md_loc)
        # We have to transform the taxonomy QZA to Metadata ourselves
        fmd = Artifact.load(q2_tax_loc).view(Metadata)
    else:
        import biom
        import pandas as pd
        from skbio.stats.ordination import OrdinationResults
        from skbio.tree import TreeNode
        with tempfile.TemporaryDirectory() as _tmp:
            tree_loc = extract_q2_artifact_to_path(_tmp, q2_tree_loc,
                                                   "tree.nwk")
            tree = TreeNode.read(tree_loc)
            tbl_loc = extract_q2_artifact_to_path(_tmp, q2_table_loc,
                                                  "feature-table.biom")
            table = biom.load_table(tbl_loc)
            if is_empire:
                pcoa_loc = extract_q2_artifact_to_path(_tmp, q2_pcoa_loc,
                                                       "ordination.txt")
                pcoa = OrdinationResults.read(pcoa_loc)
            else:
                pcoa = None
            tax_loc = extract_q2_artifact_to_path(_tmp, q2_tax_loc,
                                                  "taxonomy.tsv")
            fmd = pd.read_csv(tax_loc, sep="\t", index_col=0)
            md = pd.read_csv(md_loc, sep="\t", index_col=0, skiprows=[1])
    return tree, table, md, fmd, pcoa
Example #28
# - host_ids: a list of host subject IDs in the metadata file to subset the
#   sinks to
#
# Outputs: Three modified metadata files, where each file includes just:
# - 300 "source" samples
# - All "sink" samples of a specified empo_3 category
# Furthermore, a "SourceSink" column will be added to each metadata file and
# set accordingly.

from qiime2 import Metadata
from collections import Counter

host_ids = ["host1", "host2"]

print("loading metadata...")
df = Metadata.load("smooshed-metadata.txt").to_dataframe()
agp_sample_ids = set(df.loc[df.index.str.startswith("10317.")].index)

# 1. Construct a "Source" from each of the 3 AGP empo3 vals
agp_ids_to_use = []
for e in ["Animal distal gut", "Animal secretion", "Animal surface"]:
    # save the df subsets so we only have to do this once
    empo3subset = df[df["empo_3"] == e]

    agp_from_this_empo3 = set(empo3subset.index) & agp_sample_ids

    # Sort the list of IDs then take the first 100
    agp_empo3_subset = sorted(agp_from_this_empo3)[:100]
    agp_ids_to_use += agp_empo3_subset

# TODO subset the sinks as well? We could theoretically parallelize this across
Example #29
def run_integration_test(
    input_dir_name,
    output_dir_name,
    ranks_name,
    table_name,
    sample_metadata_name,
    feature_metadata_name=None,
    use_q2=False,
    q2_ranking_tool="songbird",
    expected_unsupported_samples=0,
    expected_unsupported_features=0,
    expect_all_unsupported_samples=False,
    q2_table_biom_format="BIOMV210Format",
    extreme_feature_count=None,
):
    """Runs qurro, and validates the output somewhat.

       Note that this is a pretty outdated function (as in, it doesn't support
       checking many of the corner cases/etc. that happen when running Qurro).
       The main purpose of this function is just checking at a high level that
       things look good, and that data is faithfully represented in the output
       main.js file.
    """

    in_dir = os.path.join("qurro", "tests", "input", input_dir_name)
    rloc = os.path.join(in_dir, ranks_name)
    tloc = os.path.join(in_dir, table_name)
    sloc = os.path.join(in_dir, sample_metadata_name)
    floc = None
    if feature_metadata_name is not None:
        floc = os.path.join(in_dir, feature_metadata_name)
    out_dir = os.path.join("docs", "demos", output_dir_name)

    rrv_qzv = result = None
    if use_q2:
        if q2_ranking_tool == "songbird":
            q2_action = q2qurro.actions.differential_plot
            q2_rank_type = "FeatureData[Differential]"
        elif q2_ranking_tool == "DEICODE":
            q2_action = q2qurro.actions.loading_plot
            q2_rank_type = "PCoAResults % Properties(['biplot'])"
        else:
            raise ValueError(
                "Unknown q2_ranking_tool: {}".format(q2_ranking_tool)
            )
        # Import all of these files as Q2 artifacts or metadata.
        rank_qza = Artifact.import_data(q2_rank_type, rloc)
        table_qza = Artifact.import_data(
            "FeatureTable[Frequency]", tloc, view_type=q2_table_biom_format
        )
        sample_metadata = Metadata.load(sloc)
        feature_metadata = None
        if floc is not None:
            feature_metadata = Metadata.load(floc)

        # Now that everything's imported, try running qurro
        rrv_qzv = q2_action(
            ranks=rank_qza,
            table=table_qza,
            sample_metadata=sample_metadata,
            feature_metadata=feature_metadata,
            extreme_feature_count=extreme_feature_count,
        )
        # Output the contents of the visualization to out_dir.
        rrv_qzv.visualization.export_data(out_dir)
    else:
        # Run qurro "standalone" -- i.e. outside of QIIME 2
        runner = CliRunner()
        args = [
            "--ranks",
            rloc,
            "--table",
            tloc,
            "--sample-metadata",
            sloc,
            "--output-dir",
            out_dir,
        ]
        if floc is not None:
            args += ["--feature-metadata", floc]
        if extreme_feature_count is not None:
            args += ["--extreme-feature-count", extreme_feature_count]
        result = runner.invoke(rrvp.plot, args)
        # Validate that the correct exit code and output were recorded
        validate_standalone_result(
            result,
            expected_unsupported_samples=expected_unsupported_samples,
            expect_all_unsupported_samples=expect_all_unsupported_samples,
            expected_unsupported_features=expected_unsupported_features,
        )
    # If we expected this test to fail due to invalid inputs, don't bother
    # doing any JSON validation.
    # (Input validity checking is done in generate.process_input(), before
    # any output files are created in generate.gen_visualization() -- so no
    # output should be created anyway in these cases.)
    if expect_all_unsupported_samples or expected_unsupported_features > 0:
        return None, None
    else:
        # Only validate JSONs if -x wasn't specified (i.e. the passed
        # extreme feature count is None)
        validate_jsons = extreme_feature_count is None
        rank_json, sample_json, count_json = validate_main_js(
            out_dir, rloc, tloc, sloc, validate_jsons=validate_jsons
        )
        return rank_json, sample_json, count_json
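
A hypothetical call for illustration only: the directory and file names are placeholders for a dataset that would live under qurro/tests/input/.

# Runs the standalone (non-QIIME 2) code path and returns the three JSONs
# embedded in the generated main.js, or (None, None) on expected failures.
rank_json, sample_json, count_json = run_integration_test(
    "my_dataset",                # hypothetical input_dir_name
    "my_dataset_demo",           # hypothetical output_dir_name
    "ranks.tsv",
    "table.biom",
    "sample-metadata.tsv",
    feature_metadata_name="feature-metadata.tsv",
    use_q2=False,
)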
Example #30
def prepare_lefse(table_file,
                  taxonomy_file,
                  metadata_file,
                  output_file,
                  class_col,
                  subclass_col=None,
                  subject_col=None,
                  where=None):
    """Create a TSV file which can be used as input for the LEfSe tool.

    This command
    1) collapses the input feature table at the genus level,
    2) computes relative frequency of the features,
    3) performs sample filtration if requested,
    4) changes the format of feature names,
    5) adds the relevant metadata as 'Class', 'Subclass', and 'Subject', and
    6) writes a text file which can be used as input for LEfSe.

    Parameters
    ----------
    table_file : str
        Path to the table file with the 'FeatureTable[Frequency]' type.
    taxonomy_file : str
        Path to the taxonomy file with the 'FeatureData[Taxonomy]' type.
    metadata_file : str
        Path to the metadata file.
    output_file : str
        Path to the output file.
    class_col : str
        Metadata column used as 'Class' by LEfSe.
    subclass_col : str, optional
        Metadata column used as 'Subclass' by LEfSe.
    subject_col : str, optional
        Metadata column used as 'Subject' by LEfSe.
    where : str, optional
        SQLite 'WHERE' clause specifying sample metadata criteria.
    """
    _ = taxa.methods.collapse(table=Artifact.load(table_file),
                              taxonomy=Artifact.load(taxonomy_file),
                              level=6)

    _ = feature_table.methods.relative_frequency(table=_.collapsed_table)

    if where is None:
        df = _.relative_frequency_table.view(pd.DataFrame)
    else:
        _ = feature_table.methods.filter_samples(
            table=_.relative_frequency_table,
            metadata=Metadata.load(metadata_file),
            where=where)
        df = _.filtered_table.view(pd.DataFrame)

    def f(x):
        for c in ['-', '[', ']', '(', ')', ' ']:
            x = x.replace(c, '_')

        ranks = x.split(';')
        base = ranks[0]
        result = [base]

        for i, rank in enumerate(ranks[1:], start=2):
            if rank == '__':
                result.append(f'{base}_x__L{i}')
            elif rank.split('__')[1] == '':
                result.append(f'{base}_{rank}L{i}')
            else:
                result.append(rank)
                base = rank

        return '|'.join(result)

    df.columns = [f(x) for x in df.columns.to_list()]

    mf = dokdo.get_mf(metadata_file)
    mf = mf.replace(' ', '_', regex=True)
    cols = mf.columns.to_list()
    df = pd.concat([df, mf], axis=1, join="inner")
    df.insert(0, class_col, df.pop(class_col))
    cols.remove(class_col)

    if subclass_col is None and subject_col is None:
        pass
    elif subclass_col is not None and subject_col is None:
        df.insert(1, subclass_col, df.pop(subclass_col))
        cols.remove(subclass_col)
    elif subclass_col is None and subject_col is not None:
        df.insert(1, subject_col, df.pop(subject_col))
        cols.remove(subject_col)
    else:
        df.insert(1, subclass_col, df.pop(subclass_col))
        df.insert(2, subject_col, df.pop(subject_col))
        cols.remove(subclass_col)
        cols.remove(subject_col)

    df.drop(columns=cols, inplace=True)
    df.T.to_csv(output_file, header=False, sep='\t')
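
A closing usage sketch; every path, column name, and the WHERE clause below is a hypothetical placeholder.

# Hypothetical invocation: collapses the table to genus level, keeps only the
# samples matching `where`, and writes a LEfSe-ready TSV.
prepare_lefse(table_file="table.qza",
              taxonomy_file="taxonomy.qza",
              metadata_file="sample-metadata.tsv",
              output_file="input-for-lefse.tsv",
              class_col="body-site",
              subject_col="subject",
              where="[body-site] IN ('gut', 'tongue')")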