def test_from_file_error(self):
    """Check that malformed ordination files are rejected by from_file.

    Every path in ``self.fferror_test_paths`` must make
    ``OrdinationResults.from_file`` raise ``FileFormatError``; every path
    in ``self.verror_test_paths`` must make it raise ``ValueError``.
    """
    def assert_all_raise(relative_paths, expected_error):
        # Each fixture is resolved through get_data_path and handed to
        # the parser as an already-open file handle.
        for relative_path in relative_paths:
            with open(get_data_path(relative_path), 'U') as handle:
                with npt.assert_raises(expected_error):
                    OrdinationResults.from_file(handle)

    assert_all_raise(self.fferror_test_paths, FileFormatError)
    assert_all_raise(self.verror_test_paths, ValueError)
def test_from_file(self):
    """Yield an equality check for each fixture and each input flavour.

    Every fixture in ``self.test_paths`` is parsed twice with
    ``OrdinationResults.from_file``: once from an open file handle and
    once from the file name. Each parsed result is yielded together with
    ``self.check_OrdinationResults_equal`` and the matching entry of
    ``self.scores``.
    """
    for expected, relative_path in zip(self.scores, self.test_paths):
        for source_kind in ('file like', 'file name'):
            path = get_data_path(relative_path)
            if source_kind == 'file name':
                observed = OrdinationResults.from_file(path)
            elif source_kind == 'file like':
                with open(path) as handle:
                    observed = OrdinationResults.from_file(handle)
            yield self.check_OrdinationResults_equal, observed, expected
def test_get_procrustes_results(self):
    """Regression-check get_procrustes_results on self.pcoa1_f vs itself.

    These are only sanity checks, since the individual components are
    tested elsewhere: the expected values were taken from the output of
    a run and guard against that output changing.
    """
    results = get_procrustes_results(
        self.pcoa1_f,
        self.pcoa1_f,
        sample_id_map={
            'CP3A1': 'S1',
            'CC1A1': 'S2',
            'CC2A1': 'S3',
            'CP1A1': 'S4'
        },
        randomize=None,
        max_dimensions=None)
    first, second, m_squared = results[0], results[1], results[2]

    expected = OrdinationResults(
        eigvals=array([
            8976580.24393, 6044862.67619, 4372581.39431,
            3161360.10319, 2583594.45275, 2407555.39787
        ]),
        proportion_explained=array([
            23.1764657118, 15.6071186064, 11.2894866423,
            8.16225689998, 6.67053450426, 6.21602253997
        ]),
        site=array([
            [-0.199225958574, -0.250846540029, -0.119813087305,
             -0.155652031006, 0.18495315824, -0.160875399364],
            [-0.238263544222, -0.37724227779, -0.169458651217,
             0.0305157004776, 0.112181007345, 0.0677415967093],
            [0.116737988534, 0.414627960015, 0.201315243115,
             0.113769076804, -0.283025353088, -0.144278863311],
            [0.320751514262, 0.213460857804, 0.0879564954067,
             0.0113672537238, -0.0141088124974, 0.237412665966]
        ]),
        site_ids=['S3', 'S2', 'S1', 'S4'])

    # First transformed matrix
    assert_almost_equal(first.eigvals, expected.eigvals)
    assert_almost_equal(first.proportion_explained,
                        expected.proportion_explained)
    self.assertEqual(first.site_ids, expected.site_ids)
    assert_almost_equal(first.site, expected.site)

    # Second transformed matrix: same input, so the same expectations
    assert_almost_equal(second.eigvals, expected.eigvals)
    assert_almost_equal(second.proportion_explained,
                        expected.proportion_explained)
    assert_almost_equal(second.site, expected.site)
    self.assertEqual(second.site_ids, expected.site_ids)

    # Third element of the result must be below this threshold
    self.assertTrue(m_squared < 6e-30)
def setUp(self):
    """Create the fixtures: parsed PCoA results plus mapping rows/headers."""
    # Ordination results parsed from the in-memory PCOA_STRING fixture
    self.ord_res = OrdinationResults.from_file(StringIO(PCOA_STRING))

    self.headers = ['SampleID', 'Treatment', 'DOB', 'Description']

    # One row per sample, columns in the same order as self.headers.
    # NOTE(review): 'Ctrol_mouse_I.D._354' is reproduced verbatim from the
    # original fixture -- confirm whether the spelling is intentional.
    self.data = [
        ['PC.354', 'Control', '20061218', 'Ctrol_mouse_I.D._354'],
        ['PC.355', 'Control', '20061218', 'Control_mouse_I.D._355'],
        ['PC.356', 'Control', '20061126', 'Control_mouse_I.D._356'],
        ['PC.481', 'Control', '20070314', 'Control_mouse_I.D._481'],
        ['PC.593', 'Control', '20071210', 'Control_mouse_I.D._593'],
        ['PC.607', 'Fast', '20071112', 'Fasting_mouse_I.D._607'],
        ['PC.634', 'Fast', '20080116', 'Fasting_mouse_I.D._634'],
        ['PC.635', 'Fast', '20080116', 'Fasting_mouse_I.D._635'],
        ['PC.636', 'Fast', '20080116', 'Fasting_mouse_I.D._636'],
    ]
def parse_coords(lines):
    """Parse skbio's ordination results file into its four components.

    The parsing itself is delegated to ``OrdinationResults.from_file``;
    for the file format check
    skbio.math.stats.ordination.OrdinationResults.from_file

    Parameters
    ----------
    lines
        Passed unchanged to ``OrdinationResults.from_file``.

    Returns
    -------
    tuple
        ``(site_ids, site, eigvals, proportion_explained)`` taken from
        the parsed results, i.e.:

        - sample labels in order
        - coords (rows = samples, cols = axes in descending order)
        - eigenvalues
        - percent variance explained
    """
    parsed = OrdinationResults.from_file(lines)

    sample_labels = parsed.site_ids
    coords = parsed.site
    eigvals = parsed.eigvals
    pct_explained = parsed.proportion_explained

    return (sample_labels, coords, eigvals, pct_explained)
if __name__ == '__main__':
    # Script entry point: read the command line options, load the
    # ordination results and the mapping file, and validate the requested
    # metadata categories.
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ord_fp = opts.input_fp
    mapping_fp = opts.map_fp
    categories = opts.categories.split(',')
    output_dir = opts.output_dir
    sort_by = opts.sort_by
    algorithm = opts.algorithm
    axes = opts.axes
    weighted = opts.weight_by_vector
    window_size = opts.window_size

    # Parse the ordination results
    with open(ord_fp, 'U') as f:
        ord_res = OrdinationResults.from_file(f)

    # Parse the mapping file; only the first element of the parser's
    # return value is used, and it becomes a DataFrame indexed by its keys.
    with open(mapping_fp, 'U') as f:
        map_dict = parse_mapping_file_to_dict(f)[0]
    metamap = pd.DataFrame.from_dict(map_dict, orient='index')

    # Every requested category must be a column of the mapping file.
    for category in categories:
        if category not in metamap.keys():
            # Report the single offending category; the previous message
            # interpolated the whole `categories` list instead.
            option_parser.error("Category %s does not exist in the mapping "
                                "file" % category)

    # Sorting by 'SampleID' leaves sort_category unset (None).
    sort_category = None
    if sort_by:
        if sort_by == 'SampleID':
            sort_category = None
def setup_class(cls):
    """Build the expected OrdinationResults and the fixture path lists.

    Sets ``cls.scores`` (expected CA, CCA, PCoA and RDA results, in that
    order), ``cls.test_paths`` (the score files that test_from_file pairs
    with ``cls.scores`` by position) and the two lists of malformed files,
    ``cls.fferror_test_paths`` and ``cls.verror_test_paths``.
    """
    # CA results
    expected_ca = OrdinationResults(
        eigvals=np.array([0.0961330159181, 0.0409418140138]),
        species=np.array([[0.408869425742, 0.0695518116298],
                          [-0.1153860437, -0.299767683538],
                          [-0.309967102571, 0.187391917117]]),
        site=np.array([[-0.848956053187, 0.882764759014],
                       [-0.220458650578, -1.34482000302],
                       [1.66697179591, 0.470324389808]]),
        biplot=None,
        site_constraints=None,
        proportion_explained=None,
        species_ids=['Species1', 'Species2', 'Species3'],
        site_ids=['Site1', 'Site2', 'Site3'])

    # CCA results
    expected_cca = OrdinationResults(
        eigvals=np.array([
            0.366135830393, 0.186887643052, 0.0788466514249,
            0.082287840501, 0.0351348475787, 0.0233265839374,
            0.0099048981912, 0.00122461669234, 0.000417454724117
        ]),
        species=np.loadtxt(get_data_path('exp_OrdRes_CCA_species')),
        site=np.loadtxt(get_data_path('exp_OrdRes_CCA_site')),
        biplot=np.array(
            [[-0.169746767979, 0.63069090084, 0.760769036049],
             [-0.994016563505, 0.0609533148724, -0.0449369418179],
             [0.184352565909, -0.974867543612, 0.0309865007541]]),
        site_constraints=np.loadtxt(
            get_data_path('exp_OrdRes_CCA_site_constraints')),
        proportion_explained=None,
        species_ids=[
            'Species0', 'Species1', 'Species2', 'Species3', 'Species4',
            'Species5', 'Species6', 'Species7', 'Species8'
        ],
        site_ids=[
            'Site0', 'Site1', 'Site2', 'Site3', 'Site4',
            'Site5', 'Site6', 'Site7', 'Site8', 'Site9'
        ])

    # PCoA results
    expected_pcoa = OrdinationResults(
        eigvals=np.array([
            0.512367260461, 0.300719094427, 0.267912066004,
            0.208988681078, 0.19169895326, 0.16054234528,
            0.15017695712, 0.122457748167, 0.0
        ]),
        species=None,
        site=np.loadtxt(get_data_path('exp_OrdRes_PCoA_site')),
        biplot=None,
        site_constraints=None,
        proportion_explained=np.array([
            0.267573832777, 0.15704469605, 0.139911863774,
            0.109140272454, 0.100111048503, 0.0838401161912,
            0.0784269939011, 0.0639511763509, 0.0
        ]),
        species_ids=None,
        site_ids=[
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
            'PC.593', 'PC.355', 'PC.607', 'PC.634'
        ])

    # RDA results
    expected_rda = OrdinationResults(
        eigvals=np.array([
            25.8979540892, 14.9825779819, 8.93784077262, 6.13995623072,
            1.68070536498, 0.57735026919, 0.275983624351
        ]),
        species=np.loadtxt(get_data_path('exp_OrdRes_RDA_species')),
        site=np.loadtxt(get_data_path('exp_OrdRes_RDA_site')),
        biplot=np.array(
            [[0.422650019179, -0.559142585857, -0.713250678211],
             [0.988495963777, 0.150787422017, -0.0117848614073],
             [-0.556516618887, 0.817599992718, 0.147714267459],
             [-0.404079676685, -0.9058434809, -0.127150316558]]),
        site_constraints=np.loadtxt(
            get_data_path('exp_OrdRes_RDA_site_constraints')),
        proportion_explained=None,
        species_ids=[
            'Species0', 'Species1', 'Species2',
            'Species3', 'Species4', 'Species5'
        ],
        site_ids=[
            'Site0', 'Site1', 'Site2', 'Site3', 'Site4',
            'Site5', 'Site6', 'Site7', 'Site8', 'Site9'
        ])

    # Expected results and the files they are parsed from, same order
    cls.scores = [expected_ca, expected_cca, expected_pcoa, expected_rda]
    cls.test_paths = [
        'L&L_CA_data_scores',
        'example3_scores',
        'PCoA_sample_data_3_scores',
        'example2_scores'
    ]

    # Files expected to raise FileFormatError
    cls.fferror_test_paths = [
        'error1', 'error2', 'error3', 'error4', 'error5', 'error6'
    ]
    # Files expected to raise ValueError
    cls.verror_test_paths = [
        'v_error1', 'v_error2', 'v_error3', 'v_error4', 'v_error5',
        'v_error6', 'v_error7', 'v_error8', 'v_error9', 'v_error10'
    ]
def get_procrustes_results(
        coords_f1, coords_f2, sample_id_map=None, randomize=None,
        max_dimensions=None, get_eigenvalues=get_mean_eigenvalues,
        get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two ordination result files.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs handed unchanged to ``OrdinationResults.from_file``.
    sample_id_map : optional
        When truthy, passed together with each file's sample ids to
        ``map_sample_ids`` before the shared samples are determined.
    randomize : callable, optional
        When truthy, called with the reordered coordinates of the second
        file; its return value replaces those coordinates.
    max_dimensions : int, optional
        When truthy, both coordinate matrices go through
        ``filter_coords_matrix`` and the eigenvalue / percent-explained
        vectors are cut to their first ``max_dimensions`` entries.
        Otherwise the shorter vector of each pair is zero-padded to the
        length of the longer one.
    get_eigenvalues, get_percent_variation_explained : callable
        Called with the two eigenvalue vectors and the two
        percent-explained vectors respectively; the results are stored on
        both returned ``OrdinationResults``.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)``. The first two are ``OrdinationResults``
        built from the output of ``procrustes``; ``m_squared`` is the
        third value returned by ``procrustes``; ``randomized_coords2`` is
        an ``OrdinationResults`` holding the shuffled second matrix, or
        ``None`` when ``randomize`` is not given.

    Raises
    ------
    ValueError
        If the two files share no sample ids.
    """
    def _zero_pad_to_match(first, second):
        # Append zeros to whichever vector is shorter so both end up with
        # the same length; equal-length inputs are returned untouched.
        shortfall = len(first) - len(second)
        if shortfall > 0:
            second = append(second, zeros(shortfall))
        elif shortfall < 0:
            first = append(first, zeros(-shortfall))
        return first, second

    # Parse the PCoA files
    ord_res_1 = OrdinationResults.from_file(coords_f1)
    ord_res_2 = OrdinationResults.from_file(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Put both coordinate matrices in one common sample order.
    # NOTE: the order comes from a set intersection, so it is the set's
    # iteration order, not the order of the samples in either file.
    order = list(set(sample_ids1) & set(sample_ids2))

    # Fail before doing any reordering work when nothing overlaps.
    if not order:
        raise ValueError('No overlapping samples in the two files')

    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Each pair is padded from its own lengths. Previously the
        # eigenvalue padding was chosen by comparing the percent-explained
        # lengths, which could request a negative number of zeros (or skip
        # the padding) when the two pairs disagreed.
        pct_var1, pct_var2 = _zero_pad_to_match(pct_var1, pct_var2)
        eigvals1, eigvals2 = _zero_pad_to_match(eigvals1, eigvals2)

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m1),
        site_ids=order)
    transformed_coords2 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m2),
        site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)
def setUp(self):
    """Create the fixtures: PCoA results, metadata frame, categories, sort key."""
    self.ord_res = OrdinationResults(
        eigvals=np.array([
            0.512367260461, 0.300719094427, 0.267912066004,
            0.208988681078, 0.19169895326, 0.16054234528,
            0.15017695712, 0.122457748167, 0.0
        ]),
        site=np.array([
            [-0.212230626531, 0.216034194368, 0.03532727349,
             -0.254450494129, -0.0687468542543, 0.231895596562,
             0.00496549154314, -0.0026246871695, 9.73837390723e-10],
            [-0.277487312135, -0.0295483215975, -0.0744173437992,
             0.0957182357964, 0.204714844022, -0.0055407341857,
             -0.190287966833, 0.16307126638, 9.73837390723e-10],
            [0.220886492631, 0.0874848360559, -0.351990132198,
             -0.00316535032886, 0.114635191853, -0.00019194106125,
             0.188557853937, 0.030002427212, 9.73837390723e-10],
            [0.0308923744062, -0.0446295973489, 0.133996451689,
             0.29318228566, -0.167812539312, 0.130996149793,
             0.113551017379, 0.109987942454, 9.73837390723e-10],
            [0.27616778138, -0.0341866951102, 0.0633000238256,
             0.100446653327, 0.123802521199, 0.1285839664,
             -0.132852841046, -0.217514322505, 9.73837390723e-10],
            [0.202458130052, -0.115216120518, 0.301820871723,
             -0.18300251046, 0.136208248567, -0.0989435556722,
             0.0927738484879, 0.0909429797672, 9.73837390723e-10],
            [0.236467470907, 0.21863434374, -0.0301637746424,
             -0.0225473129718, -0.205287183891, -0.180224615141,
             -0.165277751908, 0.0411933458557, 9.73837390723e-10],
            [-0.105517545144, -0.41405687433, -0.150073017617,
             -0.116066751485, -0.158763393475, -0.0223918378516,
             -0.0263068046112, -0.0501209518091, 9.73837390723e-10],
            [-0.371636765565, 0.115484234741, 0.0721996475289,
             0.0898852445906, 0.0212491652909, -0.184183028843,
             0.114877153051, -0.164938000185, 9.73837390723e-10]
        ]),
        proportion_explained=np.array([
            25.6216900347, 15.7715955926, 14.1215046787,
            11.6913885817, 9.83044890697, 8.51253468595,
            7.88775505332, 6.56308246609, 4.42499350906e-16
        ]),
        site_ids=[
            'PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
            'PC.593', 'PC.355', 'PC.607', 'PC.634'
        ])

    # (sample id, Treatment, DOB, Weight, Description), one row per sample
    sample_rows = [
        ('PC.354', 'Control', '20061218', '60', 'Control_mouse_I.D._354'),
        ('PC.355', 'Control', '20061218', '55', 'Control_mouse_I.D._355'),
        ('PC.356', 'Control', '20061126', '50', 'Control_mouse_I.D._356'),
        ('PC.481', 'Control', '20070314', '52', 'Control_mouse_I.D._481'),
        ('PC.593', 'Control', '20071210', '57', 'Control_mouse_I.D._593'),
        ('PC.607', 'Fast', '20071112', '65', 'Fasting_mouse_I.D._607'),
        ('PC.634', 'Fast', '20080116', '68', 'Fasting_mouse_I.D._634'),
        ('PC.635', 'Fast', '20080116', '70', 'Fasting_mouse_I.D._635'),
        ('PC.636', 'Fast', '20080116', '72', 'Fasting_mouse_I.D._636'),
    ]
    metadata_by_sample = {
        sample_id: {
            'Treatment': treatment,
            'DOB': dob,
            'Weight': weight,
            'Description': description
        }
        for sample_id, treatment, dob, weight, description in sample_rows
    }
    # Sample ids become the index of the resulting frame
    self.metadata_map = pd.DataFrame.from_dict(metadata_by_sample,
                                               orient='index')

    self.categories = ['Treatment']
    self.sort_by = 'Weight'
def get_procrustes_results(
        coords_f1, coords_f2, sample_id_map=None, randomize=None,
        max_dimensions=None, get_eigenvalues=get_mean_eigenvalues,
        get_percent_variation_explained=get_mean_percent_variation):
    """Run a Procrustes analysis on two ordination result files.

    Parameters
    ----------
    coords_f1, coords_f2
        Inputs handed unchanged to ``OrdinationResults.from_file``.
    sample_id_map : optional
        When truthy, passed together with each file's sample ids to
        ``map_sample_ids`` before the shared samples are determined.
    randomize : callable, optional
        When truthy, called with the reordered coordinates of the second
        file; its return value replaces those coordinates.
    max_dimensions : int, optional
        When truthy, both coordinate matrices go through
        ``filter_coords_matrix`` and the eigenvalue / percent-explained
        vectors are cut to their first ``max_dimensions`` entries.
        Otherwise the shorter vector of each pair is zero-padded to the
        length of the longer one.
    get_eigenvalues, get_percent_variation_explained : callable
        Called with the two eigenvalue vectors and the two
        percent-explained vectors respectively; the results are stored on
        both returned ``OrdinationResults``.

    Returns
    -------
    tuple
        ``(transformed_coords1, transformed_coords2, m_squared,
        randomized_coords2)``. The first two are ``OrdinationResults``
        built from the output of ``procrustes``; ``m_squared`` is the
        third value returned by ``procrustes``; ``randomized_coords2`` is
        an ``OrdinationResults`` holding the shuffled second matrix, or
        ``None`` when ``randomize`` is not given.

    Raises
    ------
    ValueError
        If the two files share no sample ids.
    """
    def _zero_pad_to_match(first, second):
        # Append zeros to whichever vector is shorter so both end up with
        # the same length; equal-length inputs are returned untouched.
        shortfall = len(first) - len(second)
        if shortfall > 0:
            second = append(second, zeros(shortfall))
        elif shortfall < 0:
            first = append(first, zeros(-shortfall))
        return first, second

    # Parse the PCoA files
    ord_res_1 = OrdinationResults.from_file(coords_f1)
    ord_res_2 = OrdinationResults.from_file(coords_f2)

    sample_ids1 = ord_res_1.site_ids
    coords1 = ord_res_1.site
    eigvals1 = ord_res_1.eigvals
    pct_var1 = ord_res_1.proportion_explained

    sample_ids2 = ord_res_2.site_ids
    coords2 = ord_res_2.site
    eigvals2 = ord_res_2.eigvals
    pct_var2 = ord_res_2.proportion_explained

    if sample_id_map:
        sample_ids1 = map_sample_ids(sample_ids1, sample_id_map)
        sample_ids2 = map_sample_ids(sample_ids2, sample_id_map)

    # Put both coordinate matrices in one common sample order.
    # NOTE: the order comes from a set intersection, so it is the set's
    # iteration order, not the order of the samples in either file.
    order = list(set(sample_ids1) & set(sample_ids2))

    # Fail before doing any reordering work when nothing overlaps.
    if not order:
        raise ValueError('No overlapping samples in the two files')

    coords1 = reorder_coords(coords1, sample_ids1, order)
    coords2 = reorder_coords(coords2, sample_ids2, order)

    # If this is a random trial, apply the shuffling function passed as
    # randomize()
    if randomize:
        coords2 = randomize(coords2)
        randomized_coords2 = OrdinationResults(eigvals=eigvals2,
                                               proportion_explained=pct_var2,
                                               site=coords2,
                                               site_ids=order)
    else:
        randomized_coords2 = None

    coords1, coords2 = pad_coords_matrices(coords1, coords2)
    if max_dimensions:
        coords1 = filter_coords_matrix(coords1, max_dimensions)
        coords2 = filter_coords_matrix(coords2, max_dimensions)
        pct_var1 = pct_var1[:max_dimensions]
        pct_var2 = pct_var2[:max_dimensions]
        eigvals1 = eigvals1[:max_dimensions]
        eigvals2 = eigvals2[:max_dimensions]
    else:
        # Each pair is padded from its own lengths. Previously the
        # eigenvalue padding was chosen by comparing the percent-explained
        # lengths, which could request a negative number of zeros (or skip
        # the padding) when the two pairs disagreed.
        pct_var1, pct_var2 = _zero_pad_to_match(pct_var1, pct_var2)
        eigvals1, eigvals2 = _zero_pad_to_match(eigvals1, eigvals2)

    # Run the Procrustes analysis
    transformed_coords_m1, transformed_coords_m2, m_squared =\
        procrustes(coords1, coords2)

    eigvals = get_eigenvalues(eigvals1, eigvals2)
    pct_var = get_percent_variation_explained(pct_var1, pct_var2)

    transformed_coords1 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m1),
        site_ids=order)
    transformed_coords2 = OrdinationResults(
        eigvals=asarray(eigvals),
        proportion_explained=asarray(pct_var),
        site=asarray(transformed_coords_m2),
        site_ids=order)

    # Return the results
    return (transformed_coords1, transformed_coords2,
            m_squared, randomized_coords2)