def test_get_procrustes_results(self):
    """Regression check: fit the pcoa1_f fixture against itself.

    Both transformed results must match the recorded eigenvalues,
    proportions explained, site coordinates and site ids, and the third
    result element must be below 6e-30.
    """
    id_map = {"CP3A1": "S1", "CC1A1": "S2", "CC2A1": "S3", "CP1A1": "S4"}
    actual = get_procrustes_results(
        StringIO(pcoa1_f),
        StringIO(pcoa1_f),
        sample_id_map=id_map,
        randomize=None,
        max_dimensions=None
    )

    # Just some sanity checks, as the individual components are already
    # tested -- the values below were taken from the output of a run and
    # guard against that output changing.
    expected = OrdinationResults(
        eigvals=array([
            8976580.24393, 6044862.67619, 4372581.39431,
            3161360.10319, 2583594.45275, 2407555.39787,
        ]),
        proportion_explained=array([
            23.1764657118, 15.6071186064, 11.2894866423,
            8.16225689998, 6.67053450426, 6.21602253997,
        ]),
        site=array([
            [-0.199225958574, -0.250846540029, -0.119813087305,
             -0.155652031006, 0.18495315824, -0.160875399364],
            [-0.238263544222, -0.37724227779, -0.169458651217,
             0.0305157004776, 0.112181007345, 0.0677415967093],
            [0.116737988534, 0.414627960015, 0.201315243115,
             0.113769076804, -0.283025353088, -0.144278863311],
            [0.320751514262, 0.213460857804, 0.0879564954067,
             0.0113672537238, -0.0141088124974, 0.237412665966],
        ]),
        site_ids=["S3", "S2", "S1", "S4"]
    )

    # First transformed result.
    first = actual[0]
    assert_almost_equal(first.eigvals, expected.eigvals)
    assert_almost_equal(first.proportion_explained,
                        expected.proportion_explained)
    self.assertEqual(first.site_ids, expected.site_ids)
    assert_almost_equal(first.site, expected.site)

    # Second transformed result.
    second = actual[1]
    assert_almost_equal(second.eigvals, expected.eigvals)
    assert_almost_equal(second.proportion_explained,
                        expected.proportion_explained)
    assert_almost_equal(second.site, expected.site)
    self.assertEqual(second.site_ids, expected.site_ids)

    self.assertTrue(actual[2] < 6e-30)
def test_get_procrustes_results(self):
    """Fit pcoa1_f against itself and compare with recorded values."""
    id_map = {'CP3A1': 'S1', 'CC1A1': 'S2', 'CC2A1': 'S3', 'CP1A1': 'S4'}
    reference_f = StringIO(pcoa1_f)
    query_f = StringIO(pcoa1_f)
    actual = get_procrustes_results(reference_f, query_f,
                                    sample_id_map=id_map,
                                    randomize=None,
                                    max_dimensions=None)

    # Just some sanity checks, as the individual components are already
    # tested -- these values come from inspecting the output of a run and
    # exist to detect any change in that output.
    recorded_eigvals = array([8976580.24393, 6044862.67619,
                              4372581.39431, 3161360.10319,
                              2583594.45275, 2407555.39787])
    recorded_prop_expl = array([23.1764657118, 15.6071186064,
                                11.2894866423, 8.16225689998,
                                6.67053450426, 6.21602253997])
    recorded_site = array([
        [-0.199225958574, -0.250846540029, -0.119813087305,
         -0.155652031006, 0.18495315824, -0.160875399364],
        [-0.238263544222, -0.37724227779, -0.169458651217,
         0.0305157004776, 0.112181007345, 0.0677415967093],
        [0.116737988534, 0.414627960015, 0.201315243115,
         0.113769076804, -0.283025353088, -0.144278863311],
        [0.320751514262, 0.213460857804, 0.0879564954067,
         0.0113672537238, -0.0141088124974, 0.237412665966],
    ])
    recorded_site_ids = ['S3', 'S2', 'S1', 'S4']
    expected = OrdinationResults(eigvals=recorded_eigvals,
                                 proportion_explained=recorded_prop_expl,
                                 site=recorded_site,
                                 site_ids=recorded_site_ids)

    assert_almost_equal(actual[0].eigvals, expected.eigvals)
    assert_almost_equal(actual[0].proportion_explained,
                        expected.proportion_explained)
    self.assertEqual(actual[0].site_ids, expected.site_ids)
    assert_almost_equal(actual[0].site, expected.site)

    assert_almost_equal(actual[1].eigvals, expected.eigvals)
    assert_almost_equal(actual[1].proportion_explained,
                        expected.proportion_explained)
    assert_almost_equal(actual[1].site, expected.site)
    self.assertEqual(actual[1].site_ids, expected.site_ids)

    # The third element must be (numerically) zero for identical inputs.
    self.assertTrue(actual[2] < 6e-30)
def test_get_procrustes_results(self):
    """Fit self.pcoa1_f against itself and compare the text output.

    The first two result elements are compared line-by-line (as sets, so
    line order is ignored) with a recorded output; the third element must
    be below 6e-30.
    """
    id_map = {'CP3A1': 'S1', 'CC1A1': 'S2', 'CC2A1': 'S3', 'CP1A1': 'S4'}
    actual = get_procrustes_results(self.pcoa1_f,
                                    self.pcoa1_f,
                                    sample_id_map=id_map,
                                    randomize=None,
                                    max_dimensions=None)

    # Just some sanity checks, as the individual components are already
    # tested -- the text below was taken from the output of a run and
    # guards against that output changing.
    recorded_output = 'pc vector number\t1\t2\t3\t4\t5\t6\nS1\t0.116737988534\t0.414627960015\t0.201315243115\t0.113769076804\t-0.283025353088\t-0.144278863311\nS2\t-0.238263544222\t-0.37724227779\t-0.169458651217\t0.0305157004776\t0.112181007345\t0.0677415967093\nS3\t-0.199225958574\t-0.250846540029\t-0.119813087305\t-0.155652031006\t0.18495315824\t-0.160875399364\nS4\t0.320751514262\t0.213460857804\t0.0879564954067\t0.0113672537238\t-0.0141088124974\t0.237412665966\n\n\neigvals\t8976580.24393\t6044862.67619\t4372581.39431\t3161360.10319\t2583594.45275\t2407555.39787\n% variation explained\t23.1764657118\t15.6071186064\t11.2894866423\t8.16225689998\t6.67053450426\t6.21602253997'
    recorded_lines = set(recorded_output.split('\n'))

    first_lines = set(actual[0].split('\n'))
    self.assertEqual(first_lines, recorded_lines)

    second_lines = set(actual[1].split('\n'))
    self.assertEqual(second_lines, recorded_lines)

    self.assertTrue(actual[2] < 6e-30)
def test_get_procrustes_results(self):
    """Self-vs-self run on self.pcoa1_f reproduces the recorded output."""
    sample_id_map = {
        'CP3A1': 'S1',
        'CC1A1': 'S2',
        'CC2A1': 'S3',
        'CP1A1': 'S4',
    }
    actual = get_procrustes_results(
        self.pcoa1_f, self.pcoa1_f,
        sample_id_map=sample_id_map, randomize=None, max_dimensions=None)

    # Just some sanity checks, as the individual components are already
    # tested -- this is the output observed from a run, kept here so that
    # any change in it is noticed.
    snapshot = 'pc vector number\t1\t2\t3\t4\t5\t6\nS1\t0.116737988534\t0.414627960015\t0.201315243115\t0.113769076804\t-0.283025353088\t-0.144278863311\nS2\t-0.238263544222\t-0.37724227779\t-0.169458651217\t0.0305157004776\t0.112181007345\t0.0677415967093\nS3\t-0.199225958574\t-0.250846540029\t-0.119813087305\t-0.155652031006\t0.18495315824\t-0.160875399364\nS4\t0.320751514262\t0.213460857804\t0.0879564954067\t0.0113672537238\t-0.0141088124974\t0.237412665966\n\n\neigvals\t8976580.24393\t6044862.67619\t4372581.39431\t3161360.10319\t2583594.45275\t2407555.39787\n% variation explained\t23.1764657118\t15.6071186064\t11.2894866423\t8.16225689998\t6.67053450426\t6.21602253997'

    # Both transformed outputs are compared as unordered sets of lines.
    for transformed in actual[:2]:
        self.assertEqual(set(transformed.split('\n')),
                         set(snapshot.split('\n')))

    self.assertTrue(actual[2] < 6e-30)
def test_get_procrustes_results_imprefect_sample_overlap(self):
    """Only sample ids present in both inputs appear in the output."""
    id_map = {'aaa': 'S0', 'bbb': 'S1', 'ccc': 'S2', 'ddd': 'S3',
              'eee': 'S4'}
    actual = get_procrustes_results(self.pcoa3_f,
                                    self.pcoa4_f,
                                    sample_id_map=id_map,
                                    randomize=None,
                                    max_dimensions=None)

    # Confirm that only the sample ids that are in both procrustes
    # results show up in the output.
    shared_ids = ('S1', 'S2', 'S3')
    unshared_ids = ('S0', 'S4')
    for transformed in actual[:2]:
        for sample_id in shared_ids:
            self.assertTrue(sample_id in transformed)
        for sample_id in unshared_ids:
            self.assertTrue(sample_id not in transformed)
def test_get_procrustes_results_imprefect_sample_overlap(self):
    """Check that non-overlapping samples are dropped from both outputs."""
    sample_id_map = {
        'aaa': 'S0',
        'bbb': 'S1',
        'ccc': 'S2',
        'ddd': 'S3',
        'eee': 'S4',
    }
    actual = get_procrustes_results(
        self.pcoa3_f, self.pcoa4_f,
        sample_id_map=sample_id_map, randomize=None, max_dimensions=None)

    # Confirm that only the sample ids that are in both procrustes
    # results show up in the output: (sample id, should it be present?)
    expectations = [
        ('S1', True),
        ('S2', True),
        ('S3', True),
        ('S0', False),
        ('S4', False),
    ]
    for output in actual[:2]:
        for sample_id, should_be_present in expectations:
            self.assertTrue((sample_id in output) == should_be_present)
def test_get_procrustes_results_imprefect_sample_overlap(self):
    """Only sample ids present in both inputs appear in site_ids."""
    id_map = {"aaa": "S0", "bbb": "S1", "ccc": "S2", "ddd": "S3",
              "eee": "S4"}
    reference_f = StringIO(pcoa3_f)
    query_f = StringIO(pcoa4_f)
    actual = get_procrustes_results(reference_f, query_f,
                                    sample_id_map=id_map,
                                    randomize=None,
                                    max_dimensions=None)

    # Confirm that only the sample ids that are in both procrustes
    # results show up in the output.
    for result in actual[:2]:
        for kept_id in ("S1", "S2", "S3"):
            self.assertTrue(kept_id in result.site_ids)
        for dropped_id in ("S0", "S4"):
            self.assertTrue(dropped_id not in result.site_ids)
def main():
    """Fit every query coordinate matrix to the first (reference) matrix.

    Reads the command-line options via ``parse_command_line_parameters``.
    For each input file after the first, it runs
    ``get_procrustes_results`` against the reference, writes both
    transformed coordinate sets into the output directory and, when
    random trials were requested, runs ``procrustes_monte_carlo``.
    One summary row per query is written to ``procrustes_results.txt``
    in the output directory.

    Invalid option combinations are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error(
            'Must perform >= 10 trails for Monte Carlo analysis.')

    if sample_id_map_fps and \
            (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be '
                            'exactly one fewer sample id maps than input '
                            'coordinate matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    # Only the extension-less file name is needed to name the outputs.
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo '
        'p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently '
        'adjusted for multiple comparisons.'
    ]

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            # Keep only the first mapped value for each key.
            with open(sample_id_map_fps[i], "U") as f:
                sample_id_map = dict([
                    (k, v[0]) for k, v in fields_to_dict(f).iteritems()
                ])
        else:
            sample_id_map = None

        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                transf_coords1, transf_coords2, m_squared, rand_coords2 =\
                    get_procrustes_results(ref_in_f, query_in_f,
                                           sample_id_map=sample_id_map,
                                           randomize=False,
                                           max_dimensions=num_dimensions)

        transf_coords1.write(output_matrix1_fp)
        transf_coords2.write(output_matrix2_fp)

        if random_trials:
            if opts.store_trial_details:
                # '%' binds tighter than '+': without the parentheses this
                # evaluated ('trial_details_%d' % i) + 2 and raised
                # TypeError.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None

            # Re-open the inputs for the Monte Carlo run; the handles are
            # closed as soon as it returns.
            with open(reference_input_fp, 'U') as coords_f1:
                with open(query_input_fp, 'U') as coords_f2:
                    (actual_m_squared, trial_m_squareds, count_better,
                     mc_p_value) = procrustes_monte_carlo(
                        coords_f1,
                        coords_f2,
                        trials=random_trials,
                        max_dimensions=num_dimensions,
                        sample_id_map=sample_id_map,
                        trial_output_dir=trial_output_dir)

            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(
                mc_p_value, random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            # No Monte Carlo requested: p-value and count are reported NA.
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 m_squared))

    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def main():
    """Fit every query coordinate matrix to the first (reference) matrix.

    Reads the command-line options via ``parse_command_line_parameters``.
    For each input file after the first, it runs
    ``get_procrustes_results`` against the reference, writes both
    transformed coordinate outputs into the output directory and, when
    random trials were requested, runs ``procrustes_monte_carlo``.
    One summary row per query is written to ``procrustes_results.txt``
    in the output directory.

    Invalid option combinations are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_fps = opts.input_fps
    sample_id_map_fps = opts.sample_id_map_fps
    num_dimensions = opts.num_dimensions
    max_dims_str = str(num_dimensions or 'alldim')
    output_dir = opts.output_dir
    random_trials = opts.random_trials

    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trails for Monte Carlo analysis.')

    if sample_id_map_fps and \
            (len(sample_id_map_fps) + 1) != len(opts.input_fps):
        option_parser.error('If providing sample id maps, there must be exactly'
                            ' one fewer sample id maps than input coordinate'
                            ' matrices.')

    if not exists(output_dir):
        makedirs(output_dir)

    reference_input_fp = input_fps[0]
    # Only the extension-less file name is needed to name the outputs.
    reference_input_fp_basename = splitext(split(reference_input_fp)[1])[0]
    output_summary_fp = join(output_dir, 'procrustes_results.txt')
    summary_file_lines = [
        '#FP1\tFP2\tNum included dimensions\tMonte Carlo p-value\tCount better\tM^2',
        '#Warning: p-values in this file are NOT currently adjusted for multiple comparisons.']

    for i, query_input_fp in enumerate(input_fps[1:]):
        query_input_fp_basename = splitext(split(query_input_fp)[1])[0]
        output_matrix1_fp = join(
            output_dir,
            '%s_transformed_reference.txt' % reference_input_fp_basename)
        output_matrix2_fp = join(
            output_dir,
            '%s_transformed_q%d.txt' % (query_input_fp_basename, i + 1))

        if sample_id_map_fps:
            # Keep only the first mapped value for each key.
            with open(sample_id_map_fps[i], "U") as map_f:
                sample_id_map = dict(
                    [(k, v[0]) for k, v in fields_to_dict(map_f).items()])
        else:
            sample_id_map = None

        # Context managers close the input handles, which were previously
        # left open.
        with open(reference_input_fp, 'U') as ref_in_f:
            with open(query_input_fp, 'U') as query_in_f:
                (transformed_coords1, transformed_coords2, m_squared,
                 randomized_coords2) = get_procrustes_results(
                    ref_in_f,
                    query_in_f,
                    sample_id_map=sample_id_map,
                    randomize=False,
                    max_dimensions=num_dimensions)

        with open(output_matrix1_fp, 'w') as output_matrix1_f:
            output_matrix1_f.write(transformed_coords1)
        with open(output_matrix2_fp, 'w') as output_matrix2_f:
            output_matrix2_f.write(transformed_coords2)

        if random_trials:
            if opts.store_trial_details:
                # '%' binds tighter than '+': without the parentheses this
                # evaluated ('trial_details_%d' % i) + 2 and raised
                # TypeError.
                trial_output_dir = join(output_dir,
                                        'trial_details_%d' % (i + 2))
            else:
                trial_output_dir = None

            # The Monte Carlo routine is handed the input lines as lists.
            with open(reference_input_fp, 'U') as ref_in_f:
                coords_f1 = list(ref_in_f)
            with open(query_input_fp, 'U') as query_in_f:
                coords_f2 = list(query_in_f)

            (actual_m_squared, trial_m_squareds, count_better,
             mc_p_value) = procrustes_monte_carlo(
                coords_f1,
                coords_f2,
                trials=random_trials,
                max_dimensions=num_dimensions,
                sample_id_map=sample_id_map,
                trial_output_dir=trial_output_dir)

            # truncate the p-value to the correct number of significant
            # digits
            mc_p_value_str = format_p_value_for_num_iters(mc_p_value,
                                                          random_trials)
            summary_file_lines.append(
                '%s\t%s\t%s\t%s\t%d\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 mc_p_value_str, count_better, actual_m_squared))
        else:
            # No Monte Carlo requested: p-value and count are reported NA.
            summary_file_lines.append(
                '%s\t%s\t%s\tNA\tNA\t%1.3f' %
                (reference_input_fp, query_input_fp, max_dims_str,
                 m_squared))

    # Write output summary
    with open(output_summary_fp, 'w') as f:
        f.write('\n'.join(summary_file_lines))
        f.write('\n')
def main():
    """Fit the second input coordinate matrix to the first.

    Reads the command-line options via ``parse_command_line_parameters``,
    runs ``get_procrustes_results`` on the first two input files and
    writes the transformed outputs to ``pc1_transformed.txt`` and
    ``pc2_transformed.txt`` in the output directory. When random trials
    were requested, it also runs ``procrustes_monte_carlo`` and writes a
    one-row summary file named after the two inputs.

    A trial count below 10 is reported through ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    random_trials = opts.random_trials
    if random_trials is not None and random_trials < 10:
        option_parser.error('Must perform >= 10 trails for Monte Carlo analysis.')

    output_dir = opts.output_dir
    sample_id_map_fp = opts.sample_id_map_fp
    num_dimensions = opts.num_dimensions

    if not exists(output_dir):
        makedirs(output_dir)

    if opts.store_trial_details:
        trial_output_dir = '%s/trial_details/' % output_dir
    else:
        trial_output_dir = None

    input_fp1 = opts.input_fps[0]
    input_fp2 = opts.input_fps[1]
    # Only the extension-less file names are needed, for the summary path.
    input_fp1_basename = splitext(split(input_fp1)[1])[0]
    input_fp2_basename = splitext(split(input_fp2)[1])[0]
    output_summary_fp = '%s/%s_%s_procrustes_results.txt' %\
        (output_dir, input_fp1_basename, input_fp2_basename)
    output_matrix1_fp = '%s/pc1_transformed.txt' % output_dir
    output_matrix2_fp = '%s/pc2_transformed.txt' % output_dir

    if sample_id_map_fp:
        # Keep only the first mapped value for each key.
        with open(sample_id_map_fp, "U") as map_f:
            sample_id_map = dict(
                [(k, v[0]) for k, v in fields_to_dict(map_f).items()])
    else:
        sample_id_map = None

    # Context managers close the input handles, which were previously left
    # open.
    with open(input_fp1, 'U') as in_f1:
        with open(input_fp2, 'U') as in_f2:
            (transformed_coords1, transformed_coords2, m_squared,
             randomized_coords2) = get_procrustes_results(
                in_f1,
                in_f2,
                sample_id_map=sample_id_map,
                randomize=False,
                max_dimensions=num_dimensions)

    with open(output_matrix1_fp, 'w') as output_matrix1_f:
        output_matrix1_f.write(transformed_coords1)
    with open(output_matrix2_fp, 'w') as output_matrix2_f:
        output_matrix2_f.write(transformed_coords2)

    if random_trials:
        summary_file_lines = ['FP1 FP2 Included_dimensions MC_p_value Count_better M^2']

        # The Monte Carlo routine is handed the input lines as lists.
        with open(input_fp1, 'U') as in_f1:
            coords_f1 = list(in_f1)
        with open(input_fp2, 'U') as in_f2:
            coords_f2 = list(in_f2)

        (actual_m_squared, trial_m_squareds, count_better,
         mc_p_value) = procrustes_monte_carlo(
            coords_f1,
            coords_f2,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=sample_id_map,
            trial_output_dir=trial_output_dir)

        # truncate the p-value to the correct number of significant
        # digits
        mc_p_value_str = format_p_value_for_num_iters(mc_p_value,
                                                      random_trials)
        max_dims_str = str(num_dimensions or 'alldim')
        summary_file_lines.append(
            '%s %s %s %s %d %1.3f' %
            (input_fp1, input_fp2, str(max_dims_str), mc_p_value_str,
             count_better, actual_m_squared))

        # The summary is only written for Monte Carlo runs;
        # summary_file_lines does not exist otherwise.
        with open(output_summary_fp, 'w') as f:
            f.write('\n'.join(summary_file_lines))
            f.write('\n')