Exemplo n.º 1
0
 def test_save_biom(self):
     # Round-trip an experiment through save_biom() in several formats and
     # check that reading the saved file gives back the same experiment.
     #
     # NOTE: Currently not testing the save biom hdf with taxonomy
     # as there is a bug there!
     exp = ca.read_amplicon(self.test1_biom,
                            self.test1_samp,
                            normalize=None,
                            min_reads=None)
     d = mkdtemp()
     # try/finally so the temporary directory is removed even when one of
     # the assertions below fails (previously it leaked on failure).
     try:
         f = join(d, 'test1.save.biom')
         # test the hdf5 biom format (explicitly requested via fmt)
         exp.save_biom(f, fmt='hdf5')
         newexp = ca.read_amplicon(f,
                                   self.test1_samp,
                                   normalize=None,
                                   min_reads=None)
         assert_experiment_equal(newexp, exp)
         # test the txt biom format (taxonomy is not compared here)
         exp.save_biom(f, fmt='txt')
         newexp = ca.read_amplicon(f,
                                   self.test1_samp,
                                   normalize=None,
                                   min_reads=None)
         assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])
         # test saving with the default format and no feature metadata added
         # NOTE(review): the default fmt is presumably hdf5 - confirm
         exp.save_biom(f, add_metadata=None)
         newexp = ca.read(f, self.test1_samp, normalize=None)
         self.assertNotIn('taxonomy', newexp.feature_metadata)
         assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])
     finally:
         shutil.rmtree(d)
Exemplo n.º 2
0
 def setUp(self):
     '''Load the two experiments shared by the tests in this case.'''
     super().setUp()
     # simple experiment: read as-is, without normalization
     self.test1 = ca.read(
         self.test1_biom,
         self.test1_samp,
         normalize=None,
     )
     # complex (time series) experiment: read with filter_reads=1000 and
     # normalize=10000
     self.complex = ca.read_amplicon(
         self.timeseries_biom,
         self.timeseries_samp,
         filter_reads=1000,
         normalize=10000,
     )
Exemplo n.º 3
0
 def setUp(self):
     '''Create the mock database, the test experiment and a test sequence.'''
     super().setUp()
     self.mock_db = MockDatabase()
     # amplicon experiment read with filter_reads=1000 and normalize=10000
     self.test1 = ca.read_amplicon(
         self.test1_biom, self.test1_samp,
         filter_reads=1000, normalize=10000)
     # a fixed nucleotide sequence stored for use by the tests
     self.s1 = 'TACGTATGTCACAAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGTGGATTAAGCGTGTTGTGAAATGTAGACGCTCAACGTCTGAATCGCAGCGCGAACTGGTTCACTTGAGTATGCACAACGTAGGCGGAATTCGTCG'
Exemplo n.º 4
0
 def test_sort_by_data_feature(self):
     # Sorting the time series along axis=1 must match the stored
     # pre-sorted table (compared with almost_equal=True).
     obs = self.timeseries.sort_by_data(axis=1)
     sorted_table = join(self.test_data_dir, 'timeseries.sorted.freq.biom')
     sample_map = join(self.test_data_dir, 'timeseries.sample')
     exp = ca.read_amplicon(sorted_table, sample_map,
                            normalize=None, min_reads=0)
     self.assert_experiment_equal(obs, exp, almost_equal=True)
Exemplo n.º 5
0
 def test_read_amplicon(self):
     # read_amplicon() with min_reads/normalize must give the same
     # experiment as a plain read() followed by explicit filtering and
     # normalization, and must carry a taxonomy column.
     via_amplicon = ca.read_amplicon(
         self.test1_biom, min_reads=1000, normalize=10000)

     by_hand = ca.read(self.test1_biom, normalize=None)
     by_hand.filter_by_data(
         'abundance', axis=0, cutoff=1000, inplace=True, mean_or_sum='sum')
     by_hand.normalize(inplace=True)

     assert_experiment_equal(via_amplicon, by_hand)
     self.assertIn('taxonomy', via_amplicon.feature_metadata.columns)
Exemplo n.º 6
0
    def __init__(self, load_exp=None):
        '''Build the main window, wire up its widgets and show it.

        Parameters
        ----------
        load_exp : list of (table_file_name, map_file_name, study_name) or None (optional)
            experiments to read and add to the window at startup. When the
            study_name entry is None, the table file name is used instead.
        '''
        super().__init__()

        # build the widgets from the designer file
        uic.loadUi(get_ui_file_name('CalourGUI.ui'), self)

        # main buttons
        self.wLoad.clicked.connect(self.load)
        self.wPlot.clicked.connect(self.plot)

        # right-click menu of the experiment list
        experiments = self.wExperiments
        experiments.setContextMenuPolicy(QtCore.Qt.CustomContextMenu)
        experiments.customContextMenuRequested.connect(
            self.listItemRightClicked)

        # one (initially empty) action dict per action group
        self.actions = {
            group: {} for group in ['sample', 'feature', 'analysis']
        }

        # buttons of each action group
        self.add_buttons('sample', [
            'Sort', 'Filter', 'Cluster', 'Join fields',
            'Filter by original reads', 'Normalize', 'Merge'
        ])
        self.add_buttons('feature', [
            'Cluster', 'Filter min reads', 'Filter taxonomy', 'Filter fasta',
            'Sort abundance'
        ])
        self.add_buttons('analysis', ['Diff. abundance', 'Correlation'])

        # read and register the experiments requested at startup
        if load_exp is not None:
            for entry in load_exp:
                name = entry[2]
                if name is None:
                    # no study name supplied: use the table file name
                    name = entry[0]
                loaded = ca.read_amplicon(
                    entry[0], entry[1], normalize=10000, filter_reads=1000)
                loaded._studyname = name
                self.addexp(loaded)

        self.show()
Exemplo n.º 7
0
def main(argv):
    '''Run the cross-classifier meta-analysis from the command line.

    Reads an amplicon experiment, runs classifier_performance_matrix() on it
    and writes the two resulting tables to ``<output>_roc.csv`` and
    ``<output>_accuracy.csv``.

    Parameters
    ----------
    argv : list of str
        the command line arguments (without the program name)
    '''
    parser = argparse.ArgumentParser(
        description='metaanalysis cross-classifier',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input and output are required so a missing value is reported right
    # away instead of failing with a TypeError after the (long) analysis
    parser.add_argument('-i', '--input', required=True,
                        help='name of input biom table')
    parser.add_argument('-m', '--map', help='name of input mapping file')
    parser.add_argument('-o', '--output', required=True,
                        help='name of output file')

    # --use-subset defaults to True, so on its own the flag could never be
    # turned off; --no-use-subset writes False to the same destination.
    parser.add_argument(
        '--use-subset',
        dest='use_subset',
        help="Use only subset of features present in both cohorts for classifier training",
        action='store_true',
        default=True)
    parser.add_argument(
        '--no-use-subset',
        dest='use_subset',
        help="Do not restrict classifier training to features present in both cohorts",
        action='store_false')
    parser.add_argument('--shuffle',
                        help="Shuffle testing labels",
                        action='store_true',
                        default=False)
    parser.add_argument('--shuffle-source',
                        help="Shuffle training labels",
                        action='store_true',
                        default=False)
    parser.add_argument('--uname',
                        help="add unique signature to name",
                        action='store_true',
                        default=False)

    args = parser.parse_args(argv)

    cname = args.output
    if args.uname:
        # make the output name unique so parallel runs do not overwrite
        # each other's results
        cname = cname + '_' + str(uuid.uuid1())
    print('started processing file %s' % cname)

    # load the experiment
    print('loading experiment %s' % args.input)
    exp = ca.read_amplicon(args.input,
                           args.map,
                           min_reads=1000,
                           normalize=10000)
    print(exp)

    # run the classifier
    print('running the classifier')
    resdf_roc, resdf_accuracy = classifier_performance_matrix(
        exp=exp,
        use_subset_features=args.use_subset,
        shuffle=args.shuffle,
        shuffle_source=args.shuffle_source)

    # save the results
    print('saving to %s' % cname)
    resdf_roc.to_csv(cname + '_roc' + '.csv')
    resdf_accuracy.to_csv(cname + '_accuracy' + '.csv')
Exemplo n.º 8
0
 def test_read_amplicon(self):
     # read_amplicon() with filter_reads/normalize must give the same
     # experiment as a plain read() followed by explicit filtering and
     # normalization, and must carry taxonomy feature metadata.
     via_amplicon = ca.read_amplicon(
         self.test1_biom, filter_reads=1000, normalize=10000)

     by_hand = ca.read(self.test1_biom, normalize=None)
     by_hand.filter_by_data('sum_abundance', cutoff=1000, inplace=True)
     by_hand.normalize(inplace=True)

     assert_experiment_equal(via_amplicon, by_hand)
     self.assertIn('taxonomy', via_amplicon.feature_metadata)
Exemplo n.º 9
0
 def load(self):
     '''Show the load dialog and add the selected experiment to the GUI.

     Nothing happens when the dialog is cancelled. When reading the files
     fails, a warning (including the traceback) is logged and no experiment
     is added.
     '''
     win = LoadWindow()
     res = win.exec_()
     if res != QtWidgets.QDialog.Accepted:
         return

     tablefname = str(win.wTableFile.text())
     # empty text fields mean "no file supplied"
     mapfname = str(win.wMapFile.text()) or None
     gnpsfname = str(win.wGNPSFile.text()) or None
     expname = str(win.wNewName.text())
     exptype = str(win.wType.currentText())

     # orientation of the table for the two metabolomics experiment types
     openms_rows_are_samples = {
         'Metabolomics (rows are samples)': True,
         'Metabolomics (rows are features)': False,
     }

     # fail_msg is set before each read so the handler below always has it.
     # Only Exception is caught: a bare except would also swallow
     # KeyboardInterrupt and SystemExit.
     try:
         if exptype == 'Amplicon':
             fail_msg = 'Load for amplicon biom table %s map %s failed'
             expdat = ca.read_amplicon(tablefname,
                                       mapfname,
                                       normalize=10000,
                                       filter_reads=1000)
         elif exptype in openms_rows_are_samples:
             fail_msg = 'Load for openms table %s map %s failed'
             expdat = ca.read_open_ms(
                 tablefname,
                 mapfname,
                 gnps_file=gnpsfname,
                 normalize=None,
                 rows_are_samples=openms_rows_are_samples[exptype])
         else:
             # The generic reader used to sit behind a second, unreachable
             # "exptype == 'Amplicon'" test, and any other type left expdat
             # undefined. It is now the fallback for all other types.
             # NOTE(review): confirm the intended type name for this branch.
             fail_msg = 'Load for biom table %s map %s failed'
             expdat = ca.read(tablefname, mapfname)
     except Exception:
         # logger.warn is deprecated; exc_info keeps the failure reason
         logger.warning(fail_msg, tablefname, mapfname, exc_info=True)
         return

     expdat._studyname = expname
     self.addexp(expdat)
Exemplo n.º 10
0
def read_qiime2(data_file,
                sample_metadata_file=None,
                feature_metadata_file=None,
                rep_seqs_file=None,
                **kwargs):
    '''Read a qiime2 generated table (even if it was run without the --p-no-hashedfeature-ids flag)

    This is a wrapper for calour.read_amplicon(). Each input file is first
    passed through filename_from_zip() together with the name of the member
    expected inside a qza archive (feature table, taxonomy, representative
    sequences), and the resulting file names are handed to read_amplicon().

    Parameters
    ----------
    data_file: str
        name of qiime2 deblur/dada2 generated feature table qza or biom table
    sample_metadata_file: str or None, optional
        name of tab separated mapping file
    feature_metadata_file: str or None, optional
        can be the taxonomy qza or tsv generated by qiime2 feature classifier
    rep_seqs_file: str or None, optional
        if not none, name of the qiime2 representative sequences qza file (the --o-representative-sequences file name in qiime2 dada2/deblur)
        When supplied, the original feature ids are kept in the '_orig_id'
        feature metadata column and the text of the fasta header lines
        becomes the new '_feature_id' index.
    **kwargs:
        to be passed to calour.read_amplicon

    Returns
    -------
    calour.AmpliconExperiment
    '''
    import tempfile

    # files are extracted into a temporary directory that is removed once
    # everything has been read
    with tempfile.TemporaryDirectory() as tempdir:
        data_file = filename_from_zip(tempdir, data_file,
                                      'data/feature-table.biom')
        feature_metadata_file = filename_from_zip(tempdir,
                                                  feature_metadata_file,
                                                  'data/taxonomy.tsv')
        rep_seqs_file = filename_from_zip(tempdir, rep_seqs_file,
                                          'data/dna-sequences.fasta')
        expdat = ca.read_amplicon(data_file,
                                  sample_metadata_file=sample_metadata_file,
                                  feature_metadata_file=feature_metadata_file,
                                  **kwargs)
        if rep_seqs_file is not None:
            with open(rep_seqs_file) as rsf:
                # Keep only the header lines, without the leading '>' and
                # without the line terminator. The terminator used to be
                # kept, so every new feature id ended with a newline.
                seqs = [cline[1:].strip() for cline in rsf
                        if cline[0] == '>']
            # NOTE(review): the values are assigned by position, which
            # assumes the fasta records are in the same order (and number)
            # as the rows of feature_metadata - confirm.
            expdat.feature_metadata['_orig_id'] = expdat.feature_metadata[
                '_feature_id']
            expdat.feature_metadata['_feature_id'] = seqs
            expdat.feature_metadata = expdat.feature_metadata.set_index(
                '_feature_id')

    return expdat
Exemplo n.º 11
0
 def setUp(self):
     '''Load the test experiment used by the tests in this case.'''
     super().setUp()
     # amplicon experiment read with min_reads=1000 and normalize=10000
     self.test1 = ca.read_amplicon(
         self.test1_biom, self.test1_samp,
         min_reads=1000, normalize=10000)
    "age_prediction/gut_4575/gut_4575_rare_map__cohort_cantonese_sex_female__.txt",
    "age_prediction/gut_4575/gut_4575_rare_map__cohort_cantonese_sex_male__.txt"
]

distmatrix_fp = [
    '82-soil/beta-q2/',
    'PMI_16s/beta-q2/',
    'malnutrition/beta-q2/',
    'cider/beta-q2/',
]

# In[8]:

# Load the selected dataset without read filtering or normalization.
if balances:
    # read the table using the 'qiime2' data file type
    feature_datatype = 'qiime2'
    exp = ca.read_amplicon(
        biom_fp[dataset], metadata_fp[dataset],
        data_file_type='qiime2', min_reads=None, normalize=None)
else:
    # BIOM table input
    exp = ca.read_amplicon(
        biom_fp[dataset], metadata_fp[dataset],
        min_reads=None, normalize=None)
    #if (dataset!=3): exp = exp.filter_abundance(10)

# ## Modify parameter options by shape of data

# Create logarithmic scales for ranges of parameter options where valid inputs can be 1<->n_features or n_samples

# In[11]:
Exemplo n.º 13
0
 def setUp(self):
     '''Load the pre-ratio amplicon experiment and the ratio experiment.'''
     super().setUp()
     # source experiment, read with min_reads=10 and no normalization
     self.pre_ratio = ca.read_amplicon(
         self.rat_pre_biom,
         self.rat_pre_samp,
         min_reads=10,
         normalize=None,
     )
     # ratio table, read directly as a RatioExperiment
     self.ratio1 = ca.read(
         self.rat_biom,
         self.rat_samp,
         normalize=None,
         cls=RatioExperiment,
     )
import calour as cl
import numpy as np

from scipy.stats import sem
import pickle
cl.set_log_level(40)

# input biom table and mapping file
cfs = cl.read_amplicon(
    '../data/cfs.biom', '../data/cfs.map.txt',
    sparse=False, normalize=10000, min_reads=1000)

# filter levels: 0..10 in steps of 1, 20..100 in steps of 10,
# 200..500 in steps of 100
filtlev = (list(range(0, 11))
           + list(range(20, 101, 10))
           + list(range(200, 501, 100)))
B = 1000

# one accumulator list per method (ds / bh / fbh)
sig_ds_cfs = []
sig_bh_cfs = []
sig_fbh_cfs = []

err_bh_cfs = []
err_ds_cfs = []
err_fbh_cfs = []
for i in filtlev:
    print('filter level...: %s' % (i))

    sig_ds = []
    "fermentation_1976_beer/qiita_v2.txt",
    "fermentation_1976_wine/qiita_v2.txt",
    "infant_fecal_11402/filtered_metadata.tsv",
    "infant_fecal_10918/filtered_metadata.tsv",
    "infant_fecal_10080/filtered_metadata.tsv",
    "infant_fecal_11358/filtered_metadata.tsv",
    "infant_oral_2010/filtered_metadata.tsv",
    "infant_skin_2010/filtered_metadata.tsv"
]


# In[10]:


# Load the selected dataset without read filtering or normalization; with
# knn the feature table is taken from the dataset's directory prefix.
if knn:
    exp = ca.read_amplicon(
        dir_prefixes[dataset] + "/feature-table.biom",
        metadata_fp[dataset],
        min_reads=None,
        normalize=None)
else:
    exp = ca.read_amplicon(
        biom_fp[dataset],
        metadata_fp[dataset],
        min_reads=None,
        normalize=None)
print(exp)


# In[11]:


# Specify column to predict
target = None
if dataset == 0:  # 82-soil
    target = 'ph'
if dataset == 1:
    target = 'days_since_placement'
from scipy import stats
import scipy
import pickle
import time
import math
import inspect
import operator

pd.set_option('display.max_rows', 10000)

# # Importing data, no TSS normalization performed here since table is already normalized

# In[3]:

allMetab = ca.read_amplicon(
    'PMI_MS1_FeatureTable_Normalized.biom', 'pmi3_metab_meta.txt',
    min_reads=1, normalize=None)

# ## Remove controls and samples that grouped with controls on PCoA

# In[4]:

# keep only samples whose 'control' and 'pcoa_removals' fields are 'n'
allMetab = (allMetab
            .filter_samples('control', 'n')
            .filter_samples('pcoa_removals', 'n'))
allMetab.sample_metadata.description.value_counts()

# # Split by sampling location (soil v. skin)

# ## Skin sample filtering

# In[5]: