예제 #1
0
    def test_assert_ordination_results_equal(self):
        """Check assert_ordination_results_equal on equal and unequal inputs.

        Covers comparing an object with itself, comparing against a value of
        another type, nearly-equal eigenvalues, ids set on only one side, and
        every optional numeric attribute (set on one side only, clearly
        different, and nearly equal).
        """
        def grid(value=3):
            # Fresh 2x2 fixture on every call so the two objects never share
            # a list.
            return [[1, 2], [value, 4]]

        reference = OrdinationResults([1, 2])

        # An object compares equal to itself.
        assert_ordination_results_equal(reference, reference)

        # A value of another type must be rejected.
        with npt.assert_raises(AssertionError):
            assert_ordination_results_equal(reference, 'foo')

        # Eigenvalues only need to be almost equal.
        candidate = OrdinationResults([1.0000001, 1.9999999])
        assert_ordination_results_equal(reference, candidate)

        # ids present on one side and missing on the other must fail; the
        # attribute is reset afterwards so later checks start clean.
        for id_attr in ('species_ids', 'site_ids'):
            setattr(candidate, id_attr, ['abc', 'def'])
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(reference, candidate)
            setattr(candidate, id_attr, None)

        optional_numeric = ('species', 'site', 'biplot', 'site_constraints',
                            'proportion_explained')
        for attr in optional_numeric:
            # Set on the candidate only.
            setattr(candidate, attr, grid())
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(reference, candidate)
            setattr(candidate, attr, None)

            # Set on both sides, but too far apart to be almost equal.
            setattr(reference, attr, grid())
            setattr(candidate, attr, grid(3.00002))
            with npt.assert_raises(AssertionError):
                assert_ordination_results_equal(reference, candidate)
            setattr(reference, attr, None)
            setattr(candidate, attr, None)

            # Set on both sides and close enough to be almost equal.
            setattr(reference, attr, grid())
            setattr(candidate, attr, grid(3.00000002))
            assert_ordination_results_equal(reference, candidate)
            setattr(reference, attr, None)
            setattr(candidate, attr, None)
예제 #2
0
    def setUpClass(cls):
        """Create the class-level PCoA result and sample metadata fixtures."""
        pcs = ['PC1', 'PC2', 'PC3']

        # Coordinates for two samples on the three axes.
        coords = pd.DataFrame.from_dict(
            {'s1': [0.1, 0.2, 7], 's2': [0.9, 0.2, 7]},
            orient='index',
            columns=pcs,
        )
        coords.index.name = 'Sample ID'
        cls.test_df1 = coords

        eigenvalues = pd.Series([7, 2, 1], index=pcs)
        explained = pd.Series([0.7, 0.2, 0.1], index=pcs)
        cls.pcoa1 = OrdinationResults(
            'pcoa1',
            'pcoa1',
            eigvals=eigenvalues,
            samples=cls.test_df1,
            proportion_explained=explained,
        )

        # Metadata for five ids; only 's1' and 's2' also appear in the
        # coordinates above, and the last row holds only missing values.
        sample_ids = pd.Series(['s1', 's2', 'c', 'd', 'e'], name='#SampleID')
        columns = {
            'age_cat': ['30s', '40s', '50s', '30s', None],
            'num_cat': [7.24, 7.24, 8.25, 7.24, None],
            'other': [1, 2, 3, 4, None],
        }
        cls.test_metadata = pd.DataFrame(columns, index=sample_ids)
예제 #3
0
    def setUp(self):
        """Build a 5x5 distance matrix and a seeded 100-sample ordination."""
        dm_ids = [f'S{i}' for i in range(5)]
        distances = np.array([
            [0, 1, 2, 3, 4],
            [1, 0, 4, 5, 6],
            [2, 4, 0, 6, 7],
            [3, 5, 6, 0, 8],
            [4, 6, 7, 8, 0],
        ])
        self.test_dm = DistanceMatrix(distances, ids=dm_ids)

        n_samples = 100
        n_axes = 3

        # Fixed seed so the random coordinates are reproducible.
        np.random.seed(825)
        coords = np.random.normal(size=(n_samples, n_axes)) + 2
        # Scale the second and third axes by different factors.
        coords[:, 1] *= 3
        coords[:, 2] *= 6

        row_labels = [f'S{i}' for i in range(n_samples)]
        column_labels = [f'C{i}' for i in range(n_axes)]
        sample_df = pd.DataFrame(coords, index=row_labels,
                                 columns=column_labels)

        eigenvalues = pd.Series(np.arange(n_samples))
        self.test_ord_results = OrdinationResults(
            'foo',
            'bar',
            eigvals=eigenvalues,
            samples=sample_df,
        )
예제 #4
0
    def test_str(self):
        """Check str() of the fixture results and of an eigvals-only object."""
        populated_lines = [
            "Ordination results:",
            "\tEigvals: 2",
            "\tProportion explained: N/A",
            "\tSpecies: 3x2",
            "\tSite: 3x2",
            "\tBiplot: N/A",
            "\tSite constraints: N/A",
            "\tSpecies IDs: 'Species1', 'Species2', 'Species3'",
            "\tSite IDs: 'Site1', 'Site2', 'Site3'",
        ]
        expected = "\n".join(populated_lines)
        observed = str(self.ordination_results)
        self.assertEqual(observed, expected)

        # Only eigvals supplied: every optional attribute is shown as N/A.
        minimal_lines = [
            "Ordination results:",
            "\tEigvals: 1",
            "\tProportion explained: N/A",
            "\tSpecies: N/A",
            "\tSite: N/A",
            "\tBiplot: N/A",
            "\tSite constraints: N/A",
            "\tSpecies IDs: N/A",
            "\tSite IDs: N/A",
        ]
        expected = "\n".join(minimal_lines)
        observed = str(OrdinationResults(np.array([4.2])))
        self.assertEqual(observed, expected)
예제 #5
0
    def test_get_procrustes_results(self):
        """Regression check of get_procrustes_results on one file vs itself."""
        id_map = {'CP3A1': 'S1', 'CC1A1': 'S2', 'CC2A1': 'S3', 'CP1A1': 'S4'}
        actual = get_procrustes_results(StringIO(pcoa1_f),
                                        StringIO(pcoa1_f),
                                        sample_id_map=id_map,
                                        randomize=None,
                                        max_dimensions=None)

        # Just some sanity checks, as the individual components are already
        # tested -- the expected values below come from looking at the output
        # of a run and guard against it changing.
        expected = OrdinationResults(
            eigvals=array([
                8976580.24393, 6044862.67619, 4372581.39431,
                3161360.10319, 2583594.45275, 2407555.39787,
            ]),
            proportion_explained=array([
                23.1764657118, 15.6071186064, 11.2894866423,
                8.16225689998, 6.67053450426, 6.21602253997,
            ]),
            site=array([
                [-0.199225958574, -0.250846540029, -0.119813087305,
                 -0.155652031006, 0.18495315824, -0.160875399364],
                [-0.238263544222, -0.37724227779, -0.169458651217,
                 0.0305157004776, 0.112181007345, 0.0677415967093],
                [0.116737988534, 0.414627960015, 0.201315243115,
                 0.113769076804, -0.283025353088, -0.144278863311],
                [0.320751514262, 0.213460857804, 0.0879564954067,
                 0.0113672537238, -0.0141088124974, 0.237412665966],
            ]),
            site_ids=['S3', 'S2', 'S1', 'S4'],
        )

        # Both returned ordinations are compared with the same expectation.
        for observed in (actual[0], actual[1]):
            assert_almost_equal(observed.eigvals, expected.eigvals)
            assert_almost_equal(observed.proportion_explained,
                                expected.proportion_explained)
            self.assertEqual(observed.site_ids, expected.site_ids)
            assert_almost_equal(observed.site, expected.site)

        # The third element must be below this tiny threshold.
        self.assertLess(actual[2], 6e-30)
예제 #6
0
 def setUpClass(cls):
     """Create PCoA/metadata fixtures and patch the emperor resource lookup.

     The patcher for ``get_resources`` is started here and kept on the class
     as ``res_patcher``; stopping it is not done in this method.
     """
     pcs = ['PC1', 'PC2', 'PC3']

     # Coordinates for two samples on the three axes.
     coords = pd.DataFrame.from_dict(
         {'s1': [0.1, 0.2, 7], 's2': [0.9, 0.2, 7]},
         orient='index',
         columns=pcs,
     )
     coords.index.name = 'Sample ID'
     cls.test_df1 = coords

     eigenvalues = pd.Series([7, 2, 1], index=pcs)
     explained = pd.Series([0.7, 0.2, 0.1], index=pcs)
     cls.pcoa1 = OrdinationResults(
         'pcoa1',
         'pcoa1',
         eigvals=eigenvalues,
         samples=cls.test_df1,
         proportion_explained=explained,
     )

     sample_ids = pd.Series(['s1', 's2', 'c', 'd', 'e'], name='#SampleID')
     cls.test_metadata = pd.DataFrame(
         {
             'age_cat': ['30s', '40s', '50s', '30s', None],
             'num_cat': [7.24, 7.24, 8.25, 7.24, None],
             'other': [1, 2, 3, 4, None],
         },
         index=sample_ids,
     )

     # dataset1 carries metadata and a PCoA entry; dataset2 only metadata.
     dataset1 = DictElement({
         '__metadata__': MockMetadataElement(cls.test_metadata),
         '__pcoa__': PCOAElement({
             'sample_set': DictElement({'beta_metric': cls.pcoa1}),
         }),
     })
     dataset2 = DictElement({
         '__metadata__': MockMetadataElement(cls.test_metadata),
     })
     datasets = DictElement({'dataset1': dataset1, 'dataset2': dataset2})
     cls.resources = DictElement({'datasets': datasets})
     cls.resources.accept(TrivialVisitor())

     # Make the emperor API module see the tree built above.
     patch_target = 'microsetta_public_api.api.emperor.get_resources'
     cls.res_patcher = patch(patch_target)
     cls.mock_resources = cls.res_patcher.start()
     cls.mock_resources.return_value = cls.resources
예제 #7
0
    def setUp(self):
        """Create CA results, a plotting DataFrame and minimal results."""
        # In-memory CA results to serialize and deserialize; biplot, site
        # constraints and proportion explained are deliberately absent.
        self.ordination_results = OrdinationResults(
            eigvals=np.array([0.0961330159181, 0.0409418140138]),
            species=np.array([
                [0.408869425742, 0.0695518116298],
                [-0.1153860437, -0.299767683538],
                [-0.309967102571, 0.187391917117],
            ]),
            site=np.array([
                [-0.848956053187, 0.882764759014],
                [-0.220458650578, -1.34482000302],
                [1.66697179591, 0.470324389808],
            ]),
            biplot=None,
            site_constraints=None,
            proportion_explained=None,
            species_ids=['Species1', 'Species2', 'Species3'],
            site_ids=['Site1', 'Site2', 'Site3'])

        # DataFrame for testing the plot method:
        #   categorical - mix of numbers and strings
        #   numeric     - ints, floats and strings convertible to floats
        #   nancolumn   - numeric with missing data (np.nan)
        plot_rows = [
            ['foo', '42', 10],
            [22, 0, 8],
            [22, -4.2, np.nan],
            ['foo', '42.19', 11],
        ]
        self.df = pd.DataFrame(plot_rows,
                               index=['A', 'B', 'C', 'D'],
                               columns=['categorical', 'numeric', 'nancolumn'])

        # Minimal ordination results for easier testing of the plotting
        # method; its site ids match the index of the DataFrame above.
        min_eigvals = np.array([0.50, 0.25, 0.25])
        min_site = np.array([
            [0.1, 0.2, 0.3],
            [0.2, 0.3, 0.4],
            [0.3, 0.4, 0.5],
            [0.4, 0.5, 0.6],
        ])
        self.min_ord_results = OrdinationResults(
            eigvals=min_eigvals,
            site=min_site,
            site_ids=['A', 'B', 'C', 'D'])
예제 #8
0
파일: umap.py 프로젝트: biocore/q2-umap
def embed(
            distance_matrix: DistanceMatrix,
            n_neighbors: int,
            min_dist: float = 1,
            number_of_dimensions: int = 2,
            random_state: int = 724,
        ) -> OrdinationResults:
    """Embed a distance matrix with UMAP and return it as OrdinationResults.

    Parameters
    ----------
    distance_matrix : DistanceMatrix
        Pairwise distances; passed to UMAP with ``metric='precomputed'``.
    n_neighbors : int
        Forwarded to ``UMAP(n_neighbors=...)``.
    min_dist : float, optional
        Forwarded to ``UMAP(min_dist=...)``.
    number_of_dimensions : int, optional
        Forwarded to ``UMAP(n_components=...)``.
    random_state : int, optional
        Forwarded to ``UMAP(random_state=...)``.

    Returns
    -------
    OrdinationResults
        The result of ``center`` applied to the embedding. The embedding is
        padded with zero columns to at least three columns, and its eigvals
        and proportion explained are all zeros.

    Raises
    ------
    ValueError
        If ``number_of_dimensions`` is greater than the number of samples.
    """
    n_samples = len(distance_matrix.ids)
    # NOTE(review): the message mentions "- 2" but the check only rejects
    # values above n_samples -- confirm which bound is intended.
    if number_of_dimensions > n_samples:
        raise ValueError(
            f'number_of_dimensions ({number_of_dimensions}) must be fewer '
            f'than number of samples ({n_samples}) - 2'
        )

    transformer = UMAP(
        n_neighbors=n_neighbors,
        n_components=number_of_dimensions,
        min_dist=min_dist,
        random_state=random_state,
        metric='precomputed',
    )

    # Full-slice the matrix to hand UMAP the underlying distance values.
    embedding = transformer.fit_transform(distance_matrix[:, :])

    # Pad with zero-valued columns so the result has at least three columns.
    if embedding.shape[1] < 3:
        difference = 3 - embedding.shape[1]
        embedding = np.hstack((embedding, np.zeros((len(embedding),
                                                    difference))))

    # From here on the dimension count includes any padding columns.
    number_of_dimensions = embedding.shape[1]

    embedding_df = pd.DataFrame(embedding, index=distance_matrix.ids,
                                columns=[f'UMAP-{i}' for i in
                                         range(number_of_dimensions)]
                                )

    # Placeholder zeros are used for both eigvals and proportion explained.
    null_eigvals = pd.Series(np.zeros(number_of_dimensions))
    ord_results = OrdinationResults(
        'umap',
        'Uniform Manifold Approximation and Projection',
        eigvals=null_eigvals,
        samples=embedding_df,
        proportion_explained=null_eigvals,
    )

    return center(ord_results)
예제 #9
0
def emperor_output(sklearn_output, full_file_list, eigenvalues,
                   percent_variance, output_file, new_files=None):
    """Write a standalone Emperor plot for an ordination to ``output_file``.

    Parameters
    ----------
    sklearn_output
        Sample coordinates; becomes the ``samples`` DataFrame, indexed by
        ``full_file_list``.
    full_file_list
        Row labels for ``sklearn_output``.
    eigenvalues
        Values wrapped in a Series and passed as ``eigvals``.
    percent_variance
        Values wrapped in a Series and passed as ``proportion_explained``.
    output_file
        Directory to create (if needed); receives ``index.html`` and the
        Emperor support files.
    new_files : optional
        Sample ids of user-uploaded files. When given, a metadata row is
        added for each of them with every column set to "Your Data".
    """
    print("Made it to Emperor Function!")

    # Wrap the ordination output in the containers OrdinationResults takes.
    eigvals = pd.Series(data=eigenvalues)
    samples = pd.DataFrame(data=sklearn_output, index=full_file_list)
    p_explained = pd.Series(data=percent_variance)
    ores = OrdinationResults(long_method_name="principal component analysis",
                             short_method_name="pcoa",
                             eigvals=eigvals,
                             samples=samples,
                             proportion_explained=p_explained)

    # Global metadata file, re-keyed by sample id.
    global_metadata = pd.read_csv(config.PATH_TO_ORIGINAL_MAPPING_FILE,
                                  sep="\t")
    # NOTE(review): headers are captured before the rename below, so the
    # original 'filename' column name is part of this list -- confirm that
    # is intended.
    global_metadata_headers = global_metadata.columns.tolist()
    global_metadata.rename(columns={'filename': 'SampleID'}, inplace=True)
    global_metadata["type"] = "Global Data"
    global_metadata.set_index("SampleID", inplace=True)

    common = global_metadata

    # Metadata rows for the user-uploaded files. The identity check is used
    # because `!= None` is elementwise on arrays/Series and cannot be used as
    # an `if` condition for them.
    if new_files is not None:
        placeholder = ["Your Data"] * len(new_files)
        metadata_uploaded = pd.DataFrame({"SampleID": new_files,
                                          "type": list(placeholder)})
        for item in global_metadata_headers:
            metadata_uploaded[item] = list(placeholder)
        metadata_uploaded.set_index("SampleID", inplace=True)

        common = pd.concat([global_metadata, metadata_uploaded])

    # Align the metadata rows to the samples in the ordination before
    # handing both to Emperor (join="right" keeps exactly the samples' index).
    final_metadata, _ = common.align(samples, join="right", axis=0)

    # Build the Emperor plot.
    emp = Emperor(ores, final_metadata, remote=True)

    # Create the output directory.
    os.makedirs(output_file, exist_ok=True)

    with open(os.path.join(output_file, 'index.html'), 'w') as f:
        f.write(emp.make_emperor(standalone=True))
        emp.copy_support_files(output_file)
예제 #10
0
파일: umap.py 프로젝트: biocore/q2-umap
def center(embedding: OrdinationResults) -> OrdinationResults:
    """Return a copy of ``embedding`` with PCA-transformed sample coordinates.

    A PCA keeping as many components as the samples have columns is fitted
    to ``embedding.samples``. The transformed coordinates keep the original
    index and column labels, the method names are carried over, and eigvals
    and proportion explained are both set to zeros.
    """
    method_names = (embedding.short_method_name, embedding.long_method_name)

    n_axes = embedding.samples.shape[1]
    pca = PCA(n_components=n_axes)
    transformed = pca.fit_transform(embedding.samples)

    transformed_df = pd.DataFrame(
        transformed,
        index=embedding.samples.index,
        columns=embedding.samples.columns,
    )

    # The same all-zero Series serves as eigvals and proportion explained.
    zeros = pd.Series(np.zeros(n_axes))
    return OrdinationResults(
        *method_names,
        eigvals=zeros,
        samples=transformed_df,
        proportion_explained=zeros,
    )
예제 #11
0
def emperor_output(sklearn_output,
                   full_file_list,
                   eigenvalues,
                   percent_variance,
                   output_file,
                   new_files=None):
    """Write a standalone Emperor plot for an ordination to ``output_file``.

    Parameters
    ----------
    sklearn_output
        Sample coordinates; becomes the ``samples`` DataFrame, indexed by
        ``full_file_list``.
    full_file_list
        Row labels for ``sklearn_output``.
    eigenvalues
        Values wrapped in a Series and passed as ``eigvals``.
    percent_variance
        Values wrapped in a Series and passed as ``proportion_explained``.
    output_file
        Directory to create (if needed); receives ``index.html`` and the
        Emperor support files.
    new_files : optional
        Sample ids of projected (user-supplied) files. ``None`` or an empty
        sequence means there are none.
    """
    eigvals = pd.Series(data=eigenvalues)
    samples = pd.DataFrame(data=sklearn_output, index=full_file_list)
    samples.index.rename("SampleID", inplace=True)
    p_explained = pd.Series(data=percent_variance)
    ores = OrdinationResults(long_method_name="principal component analysis",
                             short_method_name="pcoa",
                             eigvals=eigvals,
                             samples=samples,
                             proportion_explained=p_explained)

    # Read in all sample metadata, keyed by sample id.
    df = pd.read_table(config.PATH_TO_ORIGINAL_MAPPING_FILE)
    df.rename(columns={"filename": "SampleID"}, inplace=True)
    df.set_index("SampleID", inplace=True)

    # When the PCA is a projection, label the existing rows "Global" and add
    # one "Your Data" row per new file. None is treated like an empty list;
    # len() is used instead of truthiness so array-likes work too.
    if new_files is not None and len(new_files) != 0:
        df["Type"] = "Global"
        new_meta = pd.DataFrame({"SampleID": new_files, "Type": "Your Data"})
        new_meta.set_index("SampleID", inplace=True)
        df = pd.concat([df, new_meta], axis=0, join="outer")

    # Align the metadata rows to the samples in the ordination
    # (join="right" keeps exactly the samples' index).
    final_metadata, _ = df.align(samples, join="right", axis=0)

    # Build the Emperor plot.
    emp = Emperor(ores, final_metadata, remote=True)

    # Create the output directory.
    os.makedirs(output_file, exist_ok=True)

    with open(os.path.join(output_file, 'index.html'), 'w') as f:
        f.write(emp.make_emperor(standalone=True))
        emp.copy_support_files(output_file)
예제 #12
0
def _ordination_to_ordination_results(fh):
    """Read the sections of an ordination file into an OrdinationResults.

    Sections are consumed from ``fh`` in a fixed order: Eigvals, Proportion
    explained, Species, Site, Biplot, Site constraints. Every section except
    the last is followed by an empty-line check.

    Raises
    ------
    OrdinationFormatError
        If the eigvals section yields nothing, or if both site constraint
        ids and site ids are present and differ. The helper checks called
        here are defined elsewhere and may raise as well.
    """
    eigenvalues = _parse_vector_section(fh, 'Eigvals')
    if eigenvalues is None:
        raise OrdinationFormatError("At least one eigval must be present.")
    _check_empty_line(fh)

    proportions = _parse_vector_section(fh, 'Proportion explained')
    _check_length_against_eigvals(proportions, eigenvalues,
                                  'proportion explained values')
    _check_empty_line(fh)

    species_scores, species_labels = _parse_array_section(fh, 'Species')
    _check_length_against_eigvals(species_scores, eigenvalues,
                                  'coordinates per species')
    _check_empty_line(fh)

    site_scores, site_labels = _parse_array_section(fh, 'Site')
    _check_length_against_eigvals(site_scores, eigenvalues,
                                  'coordinates per site')
    _check_empty_line(fh)

    # The biplot section carries no ids, unlike the other array sections.
    biplot_scores, _ = _parse_array_section(fh, 'Biplot', has_ids=False)
    _check_empty_line(fh)

    constraints, constraint_labels = _parse_array_section(
        fh, 'Site constraints')

    # When both id lists are available they have to agree.
    both_present = constraint_labels is not None and site_labels is not None
    if both_present and constraint_labels != site_labels:
        raise OrdinationFormatError(
            "Site constraints ids and site ids must be equal: %s != %s" %
            (constraint_labels, site_labels))

    parsed = dict(eigvals=eigenvalues,
                  species=species_scores,
                  site=site_scores,
                  biplot=biplot_scores,
                  site_constraints=constraints,
                  proportion_explained=proportions,
                  species_ids=species_labels,
                  site_ids=site_labels)
    return OrdinationResults(**parsed)
예제 #13
0
def _ordination_to_ordination_results(fh):
    """Read the sections of an ordination file into an OrdinationResults.

    Sections are consumed from ``fh`` in a fixed order: Eigvals, Proportion
    explained, Species, Site, Biplot, Site constraints. Every section except
    the last is followed by an empty-line check. The result is built with
    empty method names.

    Raises
    ------
    OrdinationFormatError
        If the eigvals section yields nothing, or if both the site
        constraints and the site table are present and their indexes differ.
        The helper checks called here are defined elsewhere and may raise as
        well.
    """
    eigenvalues = _parse_vector_section(fh, 'Eigvals')
    if eigenvalues is None:
        raise OrdinationFormatError("At least one eigval must be present.")
    _check_empty_line(fh)

    proportions = _parse_vector_section(fh, 'Proportion explained')
    _check_length_against_eigvals(proportions, eigenvalues,
                                  'proportion explained values')
    _check_empty_line(fh)

    feature_table = _parse_array_section(fh, 'Species')
    _check_length_against_eigvals(feature_table, eigenvalues,
                                  'coordinates per species')
    _check_empty_line(fh)

    sample_table = _parse_array_section(fh, 'Site')
    _check_length_against_eigvals(sample_table, eigenvalues,
                                  'coordinates per site')
    _check_empty_line(fh)

    # The biplot section carries no ids, unlike the other array sections.
    biplot_table = _parse_array_section(fh, 'Biplot', has_ids=False)
    _check_empty_line(fh)

    constraint_table = _parse_array_section(fh, 'Site constraints')

    # When both tables are available their indexes have to agree.
    both_present = constraint_table is not None and sample_table is not None
    if both_present and not np.array_equal(constraint_table.index,
                                           sample_table.index):
        raise OrdinationFormatError(
            "Site constraints ids and site ids must be equal: %s != %s" %
            (constraint_table.index, sample_table.index))

    parsed = dict(short_method_name='',
                  long_method_name='',
                  eigvals=eigenvalues,
                  features=feature_table,
                  samples=sample_table,
                  biplot_scores=biplot_table,
                  sample_constraints=constraint_table,
                  proportion_explained=proportions)
    return OrdinationResults(**parsed)
    def setUp(self):
        """Create two PCoA results and save each to a temporary .qza file."""
        super().setUp()
        axis_labels = ['PC1', 'PC2', 'PC3']
        self.resources = ResourceManager()

        # Temporary artifact files and their paths.
        self.fh1 = self.create_tempfile(suffix='.qza')
        self.fh2 = self.create_tempfile(suffix='.qza')
        self.pcoa_path1 = self.fh1.name
        self.pcoa_path2 = self.fh2.name

        def coordinates(rows):
            # Sample-by-axis table with the index named 'Sample ID'.
            frame = pd.DataFrame.from_dict(rows, orient='index',
                                           columns=axis_labels)
            frame.index.name = 'Sample ID'
            return frame

        def ordination(name, eigvals, proportions, samples):
            # The same name is used as both method names.
            return OrdinationResults(
                name,
                name,
                eigvals=pd.Series(eigvals, index=axis_labels),
                samples=samples,
                proportion_explained=pd.Series(proportions,
                                               index=axis_labels),
            )

        self.test_df1 = coordinates({
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
        })
        self.test_df2 = coordinates({
            's1': [0.1, 0.2, 7],
            's2': [0.9, 0.2, 7],
            's3': [0.2, -0.3, 0],
            's4': [0.111, -4, 0.2],
        })

        self.pcoa1 = ordination('pcoa1', [7, 2, 1], [0.7, 0.2, 0.1],
                                self.test_df1)
        self.pcoa2 = ordination('pcoa2', [6, 3, 1], [0.6, 0.3, 0.1],
                                self.test_df2)

        # Import each result as a PCoAResults artifact and save it.
        targets = ((self.pcoa1, self.pcoa_path1),
                   (self.pcoa2, self.pcoa_path2))
        for results, path in targets:
            imported_artifact = Artifact.import_data("PCoAResults", results)
            imported_artifact.save(path)
예제 #15
0
 def setUp(self):
     """Create nine-sample ordination results and matching metadata."""
     eigenvalues = np.array([
         0.512367260461, 0.300719094427, 0.267912066004,
         0.208988681078, 0.19169895326, 0.16054234528,
         0.15017695712, 0.122457748167, 0.0,
     ])
     # One row per sample, in the same order as `sample_labels` below.
     coordinates = np.array([
         [-0.212230626531, 0.216034194368, 0.03532727349,
          -0.254450494129, -0.0687468542543, 0.231895596562,
          0.00496549154314, -0.0026246871695, 9.73837390723e-10],
         [-0.277487312135, -0.0295483215975, -0.0744173437992,
          0.0957182357964, 0.204714844022, -0.0055407341857,
          -0.190287966833, 0.16307126638, 9.73837390723e-10],
         [0.220886492631, 0.0874848360559, -0.351990132198,
          -0.00316535032886, 0.114635191853, -0.00019194106125,
          0.188557853937, 0.030002427212, 9.73837390723e-10],
         [0.0308923744062, -0.0446295973489, 0.133996451689,
          0.29318228566, -0.167812539312, 0.130996149793,
          0.113551017379, 0.109987942454, 9.73837390723e-10],
         [0.27616778138, -0.0341866951102, 0.0633000238256,
          0.100446653327, 0.123802521199, 0.1285839664,
          -0.132852841046, -0.217514322505, 9.73837390723e-10],
         [0.202458130052, -0.115216120518, 0.301820871723,
          -0.18300251046, 0.136208248567, -0.0989435556722,
          0.0927738484879, 0.0909429797672, 9.73837390723e-10],
         [0.236467470907, 0.21863434374, -0.0301637746424,
          -0.0225473129718, -0.205287183891, -0.180224615141,
          -0.165277751908, 0.0411933458557, 9.73837390723e-10],
         [-0.105517545144, -0.41405687433, -0.150073017617,
          -0.116066751485, -0.158763393475, -0.0223918378516,
          -0.0263068046112, -0.0501209518091, 9.73837390723e-10],
         [-0.371636765565, 0.115484234741, 0.0721996475289,
          0.0898852445906, 0.0212491652909, -0.184183028843,
          0.114877153051, -0.164938000185, 9.73837390723e-10],
     ])
     explained = np.array([
         25.6216900347, 15.7715955926, 14.1215046787,
         11.6913885817, 9.83044890697, 8.51253468595,
         7.88775505332, 6.56308246609, 4.42499350906e-16,
     ])
     sample_labels = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                      'PC.593', 'PC.355', 'PC.607', 'PC.634']
     self.ord_res = OrdinationResults(eigvals=eigenvalues,
                                      site=coordinates,
                                      proportion_explained=explained,
                                      site_ids=sample_labels)

     # (sample id, Treatment, DOB, Weight, Description); all values are
     # kept as strings.
     records = [
         ('PC.354', 'Control', '20061218', '60', 'Control_mouse_I.D._354'),
         ('PC.355', 'Control', '20061218', '55', 'Control_mouse_I.D._355'),
         ('PC.356', 'Control', '20061126', '50', 'Control_mouse_I.D._356'),
         ('PC.481', 'Control', '20070314', '52', 'Control_mouse_I.D._481'),
         ('PC.593', 'Control', '20071210', '57', 'Control_mouse_I.D._593'),
         ('PC.607', 'Fast', '20071112', '65', 'Fasting_mouse_I.D._607'),
         ('PC.634', 'Fast', '20080116', '68', 'Fasting_mouse_I.D._634'),
         ('PC.635', 'Fast', '20080116', '70', 'Fasting_mouse_I.D._635'),
         ('PC.636', 'Fast', '20080116', '72', 'Fasting_mouse_I.D._636'),
     ]
     metadata_map = {
         sample_id: {'Treatment': treatment,
                     'DOB': dob,
                     'Weight': weight,
                     'Description': description}
         for sample_id, treatment, dob, weight, description in records
     }
     self.metadata_map = pd.DataFrame.from_dict(metadata_map,
                                                orient='index')
     self.categories = ['Treatment']
     self.sort_by = 'Weight'
예제 #16
0
    def setUp(self):
        """Build the expected in-memory results for CA, CCA, PCoA and RDA.

        The local names (eigvals, species, site, ...) are rebound for each
        ordination in turn, so the statement order below matters. The four
        resulting objects are stored, in that order, in
        ``self.ordination_results_objs``.
        """
        super(OrdinationResultsReaderWriterTests, self).setUp()

        # define in-memory results, one for each of the valid files in
        # self.valid_fps

        # CA results (no biplot, site constraints or proportion explained)
        eigvals = np.array([0.0961330159181, 0.0409418140138])
        species = np.array([[0.408869425742, 0.0695518116298],
                            [-0.1153860437, -0.299767683538],
                            [-0.309967102571, 0.187391917117]])
        site = np.array([[-0.848956053187, 0.882764759014],
                         [-0.220458650578, -1.34482000302],
                         [1.66697179591, 0.470324389808]])
        biplot = None
        site_constraints = None
        prop_explained = None
        species_ids = ['Species1', 'Species2', 'Species3']
        site_ids = ['Site1', 'Site2', 'Site3']
        ca_scores = OrdinationResults(eigvals=eigvals,
                                      species=species,
                                      site=site,
                                      biplot=biplot,
                                      site_constraints=site_constraints,
                                      proportion_explained=prop_explained,
                                      species_ids=species_ids,
                                      site_ids=site_ids)
        # CCA results (species, site and site constraints are loaded from
        # data files; no proportion explained)
        eigvals = np.array([
            0.366135830393, 0.186887643052, 0.0788466514249, 0.082287840501,
            0.0351348475787, 0.0233265839374, 0.0099048981912,
            0.00122461669234, 0.000417454724117
        ])
        species = np.loadtxt(get_data_path('ordres_exp_OrdRes_CCA_species'))
        site = np.loadtxt(get_data_path('ordres_exp_OrdRes_CCA_site'))
        biplot = np.array(
            [[-0.169746767979, 0.63069090084, 0.760769036049],
             [-0.994016563505, 0.0609533148724, -0.0449369418179],
             [0.184352565909, -0.974867543612, 0.0309865007541]])
        site_constraints = np.loadtxt(
            get_data_path('ordres_exp_OrdRes_CCA_site_constraints'))
        prop_explained = None
        species_ids = [
            'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
            'Species5', 'Species6', 'Species7', 'Species8'
        ]
        site_ids = [
            'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5', 'Site6',
            'Site7', 'Site8', 'Site9'
        ]
        cca_scores = OrdinationResults(eigvals=eigvals,
                                       species=species,
                                       site=site,
                                       biplot=biplot,
                                       site_constraints=site_constraints,
                                       proportion_explained=prop_explained,
                                       species_ids=species_ids,
                                       site_ids=site_ids)
        # PCoA results (site coordinates and proportion explained only; no
        # species, biplot or site constraints)
        eigvals = np.array([
            0.512367260461, 0.300719094427, 0.267912066004, 0.208988681078,
            0.19169895326, 0.16054234528, 0.15017695712, 0.122457748167, 0.0
        ])
        species = None
        site = np.loadtxt(get_data_path('ordres_exp_OrdRes_PCoA_site'))
        biplot = None
        site_constraints = None
        prop_explained = np.array([
            0.267573832777, 0.15704469605, 0.139911863774, 0.109140272454,
            0.100111048503, 0.0838401161912, 0.0784269939011, 0.0639511763509,
            0.0
        ])
        species_ids = None
        site_ids = [
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
            'PC.355', 'PC.607', 'PC.634'
        ]
        pcoa_scores = OrdinationResults(eigvals=eigvals,
                                        species=species,
                                        site=site,
                                        biplot=biplot,
                                        site_constraints=site_constraints,
                                        proportion_explained=prop_explained,
                                        species_ids=species_ids,
                                        site_ids=site_ids)
        # RDA results (species, site and site constraints are loaded from
        # data files; no proportion explained)
        eigvals = np.array([
            25.8979540892, 14.9825779819, 8.93784077262, 6.13995623072,
            1.68070536498, 0.57735026919, 0.275983624351
        ])
        species = np.loadtxt(get_data_path('ordres_exp_OrdRes_RDA_species'))
        site = np.loadtxt(get_data_path('ordres_exp_OrdRes_RDA_site'))
        biplot = np.array([[0.422650019179, -0.559142585857, -0.713250678211],
                           [0.988495963777, 0.150787422017, -0.0117848614073],
                           [-0.556516618887, 0.817599992718, 0.147714267459],
                           [-0.404079676685, -0.9058434809, -0.127150316558]])
        site_constraints = np.loadtxt(
            get_data_path('ordres_exp_OrdRes_RDA_site_constraints'))
        prop_explained = None
        species_ids = [
            'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
            'Species5'
        ]
        site_ids = [
            'Site0', 'Site1', 'Site2', 'Site3', 'Site4', 'Site5', 'Site6',
            'Site7', 'Site8', 'Site9'
        ]
        rda_scores = OrdinationResults(eigvals=eigvals,
                                       species=species,
                                       site=site,
                                       biplot=biplot,
                                       site_constraints=site_constraints,
                                       proportion_explained=prop_explained,
                                       species_ids=species_ids,
                                       site_ids=site_ids)

        # Kept in the order CA, CCA, PCoA, RDA.
        self.ordination_results_objs = [
            ca_scores, cca_scores, pcoa_scores, rda_scores
        ]
예제 #17
0
def _pad_to_same_length(vec1, vec2):
    """Zero-pad the shorter of two 1-D sequences so both have equal length."""
    diff = len(vec1) - len(vec2)
    if diff > 0:
        vec2 = append(vec2, zeros(diff))
    elif diff < 0:
        vec1 = append(vec1, zeros(-diff))
    return vec1, vec2


def get_procrustes_results(coords_f1, coords_f2, sample_id_map=None,
                           randomize=None, max_dimensions=None,
                           get_eigenvalues=get_mean_eigenvalues,
                           get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two sets of ordination results.

    Both inputs are loaded with ``OrdinationResults.read``, restricted to the
    sample ids they have in common, padded to a common number of dimensions
    and passed to ``procrustes``.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs handed unchanged to ``OrdinationResults.read``.
    sample_id_map : optional
        When truthy, both lists of sample ids are translated with
        ``map_sample_ids(sample_ids, sample_id_map)`` before being matched.
    randomize : callable, optional
        When given, it is applied to the reordered coordinates of the second
        input before the analysis, and the shuffled coordinates are returned
        as the last element of the result.
    max_dimensions : int, optional
        When truthy, the coordinate matrices are passed through
        ``filter_coords_matrix`` and the eigenvalues / proportions explained
        are truncated to this many entries.
    get_eigenvalues : callable
        Called as ``get_eigenvalues(eigvals1, eigvals2)``; its result is
        stored as the eigenvalues of both returned objects.
    get_percent_variation_explained : callable
        Called as ``get_percent_variation_explained(pct_var1, pct_var2)``;
        its result is stored as ``proportion_explained`` of both returned
        objects.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)``. The first two are ``OrdinationResults`` built
        from the matrices returned by ``procrustes``, ``m_squared`` is the
        third value returned by ``procrustes``, and ``randomized_coords2`` is
        an ``OrdinationResults`` holding the shuffled second matrix, or
        ``None`` when ``randomize`` was not supplied.

    Raises
    ------
    ValueError
        If the two inputs share no sample ids.
    """
    # Parse the PCoA files
    ord_res_1 = OrdinationResults.read(coords_f1)
    ord_res_2 = OrdinationResults.read(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Only samples present in both inputs can be compared. Fail before any
    # reordering work is attempted on an empty selection.
    order = list(set(sample_ids1) & set(sample_ids2))
    if not order:
        raise ValueError('No overlapping samples in the two files')

    # Rearrange both coordinate matrices so their rows follow the same
    # sample order.
    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)

    # Pad each pair on its own lengths (not on the lengths of the other
    # pair), and do so before any truncation, mirroring how the coordinate
    # matrices are padded and then filtered.
    pct_var1, pct_var2 = _pad_to_same_length(pct_var1, pct_var2)
    eigvals1, eigvals2 = _pad_to_same_length(eigvals1, eigvals2)

    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared = procrustes(
        coords1, coords2)

    # Both outputs share the same combined eigenvalues / proportions.
    eigvals = asarray(get_eigenvalues(eigvals1, eigvals2))
    pct_var = asarray(get_percent_variation_explained(pct_var1, pct_var2))

    transformed_coords1 = OrdinationResults(
        eigvals=eigvals,
        proportion_explained=pct_var,
        site=asarray(transformed_coords_m1),
        site_ids=order)
    transformed_coords2 = OrdinationResults(
        eigvals=eigvals,
        proportion_explained=pct_var,
        site=asarray(transformed_coords_m2),
        site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)