def main(): """Main method for script.""" parser = argparse.ArgumentParser() parser.add_argument('in_tree_filename', type=str, help='Path to initial tree') parser.add_argument('in_tree_schema', type=str, choices=['nexus', 'newick'], help='The input tree schema') parser.add_argument('out_tree_filename', type=str, help='Ouput tree path') parser.add_argument('out_tree_schema', type=str, choices=['nexus', 'newick'], help='The output tree schema') parser.add_argument('accepted_taxa_filename', type=str, help='File path to write out accepted taxon names') args = parser.parse_args() tree = TreeWrapper.get(path=args.in_tree_filename, schema=args.in_tree_schema) out_tree, accepted_taxa = get_and_replace_names(tree) # Write tree out_tree.write(path=args.out_tree_filename, schema=args.out_tree_schema) # Write accepted taxa with open(args.accepted_taxa_filename, 'w') as taxa_out_file: for taxon_name in accepted_taxa: taxa_out_file.write(taxon_name)
def test_valid(self, valid_phylo_beta_diversity_package):
    """Test the method with valid data.

    Args:
        valid_phylo_beta_diversity_package (tuple): A tuple of information
            that together forms a valid phylogenetic beta diversity
            package.

    Note:
        * Test values were determined from example at
            https://rdrr.io/rforge/betapart/man/phylo.beta.pair.html
    """
    (pam_fn, tree_fn, _, _, _, test_beta_sim_fn, test_beta_sne_fn,
     test_beta_sor_fn, _, _, _, test_phylo_beta_sim_fn,
     test_phylo_beta_sne_fn,
     test_phylo_beta_sor_fn) = valid_phylo_beta_diversity_package

    with open(pam_fn) as in_f:
        pam = Matrix.load_csv(in_f, num_header_rows=1, num_header_cols=1)
    tree = TreeWrapper.from_filename(tree_fn)
    with open(test_beta_sim_fn) as in_f:
        test_beta_sim = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_sne_fn) as in_f:
        test_beta_sne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_sor_fn) as in_f:
        test_beta_sor = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_sim_fn) as in_f:
        test_phylo_beta_sim = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_sne_fn) as in_f:
        test_phylo_beta_sne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_sor_fn) as in_f:
        test_phylo_beta_sor = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)

    (beta_sim, phylo_beta_sim, beta_sne, phylo_beta_sne, beta_sor,
     phylo_beta_sor) = pbd.calculate_phylo_beta_diversity_sorensen(
        pam, tree)

    # Check matrix outputs to see if they are within tolerance
    assert np.allclose(beta_sim, test_beta_sim)
    assert np.allclose(phylo_beta_sim, test_phylo_beta_sim)
    assert np.allclose(beta_sne, test_beta_sne)
    assert np.allclose(phylo_beta_sne, test_phylo_beta_sne)
    assert np.allclose(beta_sor, test_beta_sor)
    assert np.allclose(phylo_beta_sor, test_phylo_beta_sor)
def test_valid(self): """Test the function with valid inputs.""" # Create a tree tree = TreeWrapper.get(data='(A,(B,((C,D),(E,F))));', schema='newick') mtx = Matrix(np.random.random((6, 2, 1)), headers={ '0': ['A', 'B', 'C', 'D', 'E', 'F'], '1': ['label', 'other_val'] }) # This should not fail annotators.annotate_tree_with_label(tree, mtx, label_column=0)
def main():
    """Main method for script."""
    pam_fn = 'C:/Users/cj/Desktop/ryan_v3/pam.lmm'
    tree_fn = 'C:/Users/cj/Desktop/ryan_v3/squid_tree.nex'
    out_fn = 'C:/Users/cj/Desktop/ryan_v3/tree_mtx.lmm'

    with open(pam_fn, 'rb') as in_file:
        pam = Matrix.load_flo(in_file)
    tree = TreeWrapper.get(path=tree_fn, schema='nexus')
    tree_mtx = calculate_tree_site_statistics(pam, tree)
    with open(out_fn, 'wb') as out_file:
        tree_mtx.save(out_file)
    print(tree_mtx.max(axis=1))
    print(tree_mtx.max(axis=0))
def test_valid(self, valid_phylo_beta_diversity_package):
    """Test the method with valid data.

    Note:
        * Test values were determined from example at
            https://rdrr.io/rforge/betapart/man/phylo.beta.pair.html
    """
    (pam_fn, tree_fn, test_beta_jac_fn, test_beta_jne_fn, test_beta_jtu_fn,
     _, _, _, test_phylo_beta_jac_fn, test_phylo_beta_jne_fn,
     test_phylo_beta_jtu_fn, _, _, _) = valid_phylo_beta_diversity_package

    with open(pam_fn) as in_f:
        pam = Matrix.load_csv(in_f, num_header_rows=1, num_header_cols=1)
    tree = TreeWrapper.from_filename(tree_fn)
    with open(test_beta_jac_fn) as in_f:
        test_beta_jac = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_jne_fn) as in_f:
        test_beta_jne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_beta_jtu_fn) as in_f:
        test_beta_jtu = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_jac_fn) as in_f:
        test_phylo_beta_jac = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_jne_fn) as in_f:
        test_phylo_beta_jne = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)
    with open(test_phylo_beta_jtu_fn) as in_f:
        test_phylo_beta_jtu = Matrix.load_csv(
            in_f, num_header_rows=1, num_header_cols=1)

    (beta_jtu, phylo_beta_jtu, beta_jne, phylo_beta_jne, beta_jac,
     phylo_beta_jac) = pbd.calculate_phylo_beta_diversity_jaccard(
        pam, tree)

    # Check matrix outputs to see if they are within tolerance
    assert np.allclose(beta_jtu, test_beta_jtu)
    assert np.allclose(phylo_beta_jtu, test_phylo_beta_jtu)
    assert np.allclose(beta_jne, test_beta_jne)
    assert np.allclose(phylo_beta_jne, test_phylo_beta_jne)
    assert np.allclose(beta_jac, test_beta_jac)
    assert np.allclose(phylo_beta_jac, test_phylo_beta_jac)
def purge_tree(tree_filename, tree_schema, occurrence_filename, species_col):
    """Get a tree and purge taxa not in occurrence data."""
    tree = TreeWrapper.get(path=tree_filename, schema=tree_schema)

    # Collect the species names present in the occurrence data
    species = set()
    with open(occurrence_filename, 'r') as in_file:
        for line in in_file:
            parts = line.split(', ')
            sp_name = parts[species_col].strip()
            species.add(sp_name)

    # Prune any taxa that do not appear in the occurrence data
    purge_taxa = []
    for taxon in tree.taxon_namespace:
        if taxon.label not in species:
            purge_taxa.append(taxon)
    tree.prune_taxa(purge_taxa)
    tree.purge_taxon_namespace()
    return tree
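# A minimal usage sketch for purge_tree. The file names and column index are
# hypothetical; the occurrence file is assumed to use ', ' between columns,
# matching the split above:
#
#     pruned = purge_tree('tree.nex', 'nexus', 'occurrences.csv',
#                         species_col=1)
#     pruned.write(path='pruned_tree.nex', schema='nexus')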
def test_valid(self, tmpdir):
    """Test the function with valid inputs.

    Args:
        tmpdir (:obj:`py.path.local`): A temporary directory test fixture
            generated by pytest.
    """
    # Create a tree
    tree = TreeWrapper.get(data='(A,(B,((C,D),(E,F))));', schema='newick')
    mtx = Matrix(
        np.random.random((6, 3, 2)),
        headers={
            '0': ['A', 'B', 'C', 'D', 'E', 'F'],
            '1': ['label', 'other_val', 'one_more_val']})
    # This should not fail
    output_directory = os.path.join(tmpdir.dirname, 'plots')
    create_distribution_plots(tree, mtx, output_directory)
def get_squidded_tree(tree_fn, tree_schema, squid_json):
    """Annotate the tips of a tree with squids from a squid JSON file."""
    tree = TreeWrapper.get(path=tree_fn, schema=tree_schema)

    # Read the squid file, skipping the first line, and parse the rest as JSON
    json_data_lines = []
    with open(squid_json) as in_file:
        first_line = True
        for line in in_file:
            if first_line:
                first_line = False
            else:
                json_data_lines.append(line)
    squid_list_json = json.loads(''.join(json_data_lines))

    # Map scientific names (underscores replaced by spaces) to squids
    squid_dict = {
        i['scientific_name'].replace('_', ' '): i['header']
        for i in squid_list_json
    }
    tree.annotate_tree_tips('squid', squid_dict)
    return tree
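# A usage sketch for get_squidded_tree (hypothetical file names). The squid
# JSON file is assumed to contain one throwaway header line followed by a
# JSON list of objects with 'scientific_name' and 'header' keys:
#
#     squid_tree = get_squidded_tree('tree.nex', 'nexus', 'squids.json')
#     squid_tree.write(path='squid_tree.nex', schema='nexus')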
def main(): """Main method for script.""" parser = argparse.ArgumentParser() parser.add_argument('shapegrid_filename', type=str, help='File location of the shapegrid shapefile') parser.add_argument('pam_filename', type=str, help='File location of the PAM matrix for statistics') parser.add_argument('tree_filename', type=str, help='File location of the tree to use for statistics') parser.add_argument('tree_schema', choices=['newick', 'nexus'], help='The tree schema') parser.add_argument('out_geojson_filename', type=str, help='File location to write the output GeoJSON') parser.add_argument('--layer', nargs=2, action='append', help='File location of a layer followed by a label') args = parser.parse_args() # Load data pam = Matrix.load(args.pam_filename) tree = TreeWrapper.get(path=args.tree_filename, schema=args.tree_schema) # Encode layers encoded_layers = encode_environment_layers(args.shapegrid_filename, args.layer) # Calculate PAM statistics stats_mtx = calculate_tree_site_statistics(pam, tree) # Join encoded layers and PAM statistics mtx = join_encoded_layers_and_pam_stats(encoded_layers, stats_mtx) # Generate GeoJSON geojson_data = create_geojson(args.shapegrid_filename, mtx) # Write GeoJSON with open(args.out_geojson_filename, 'w') as out_file: json.dump(geojson_data, out_file)
def main():
    """Main method for script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'in_tree_filename', type=str,
        help='The file location of the input tree')
    parser.add_argument(
        'in_tree_schema', type=str, choices=['newick', 'nexus'],
        help='The schema of the input tree')
    parser.add_argument(
        'out_tree_filename', type=str,
        help='The file location of the output tree')
    parser.add_argument(
        'out_tree_schema', type=str, choices=['newick', 'nexus'],
        help='The schema of the output tree')
    args = parser.parse_args()

    tree = TreeWrapper.get(
        path=args.in_tree_filename, schema=args.in_tree_schema)
    out_tree = get_and_replace_names(tree)
    out_tree.write(path=args.out_tree_filename, schema=args.out_tree_schema)
        '-c', '--out_csv_filename', type=str,
        help='If provided, write the output character matrix CSV '
             'to this file location')
    args = parser.parse_args()

    # Check that input files exist
    if not os.path.exists(args.in_tree_filename):
        raise IOError(
            'Input tree {} does not exist'.format(args.in_tree_filename))
    if not os.path.exists(args.data_filename):
        raise IOError(
            'Input data file {} does not exist'.format(args.data_filename))

    # Read the tree
    tree = TreeWrapper.get(
        path=args.in_tree_filename, schema=args.in_tree_schema)

    # Read data
    if args.data_format == 'csv':
        with open(args.data_filename) as in_file:
            sequences, headers = data_readers.read_csv_alignment_flo(
                in_file)
    elif args.data_format == 'json':
        with open(args.data_filename) as in_file:
            sequences, headers = data_readers.read_json_alignment_flo(
                in_file)
    elif args.data_format == 'phylip':
        with open(args.data_filename) as in_file:
            sequences = data_readers.read_phylip_alignment_flo(in_file)
        headers = None
    elif args.data_format == 'table':
def calculate_continuous_ancestral_states(tree, char_mtx, sum_to_one=False,
                                          calc_std_err=False):
    """Calculates the continuous ancestral states for the nodes in a tree.

    Args:
        tree (Tree): A dendropy tree or TreeWrapper object.
        char_mtx (Matrix): A Matrix object with character information.  Each
            row should represent a tip in the tree and each column should be
            a variable to calculate ancestral state for.
        sum_to_one (:obj:`bool`, optional): If True, standardize the
            character matrix so that the values in a row sum to one.
            Defaults to False.
        calc_std_err (:obj:`bool`, optional): If True, calculate standard
            error for each variable.  Defaults to False.

    Returns:
        tuple: The (wrapped, labeled, pruned) tree and a matrix of character
        data with the following dimensions:
            * rows: nodes / tips in the tree
            * columns: character variables
            * depth: first layer is the calculated value, second layer is
                standard error if desired

    Todo:
        * Add function for consistent label handling.
    """
    # Wrap tree if dendropy tree
    if not isinstance(tree, TreeWrapper):
        tree = TreeWrapper.from_base_tree(tree)

    # Assign labels to nodes that don't have them
    tree.add_node_labels()

    # Synchronize tree and character data
    # Prune tree
    prune_taxa = []
    keep_taxon_labels = []
    init_row_headers = char_mtx.get_row_headers()
    for taxon in tree.taxon_namespace:
        label = taxon.label.replace(' ', '_')
        if label not in init_row_headers:
            prune_taxa.append(taxon)
            print(
                'Could not find {} in character matrix, pruning'.format(
                    label))
        else:
            keep_taxon_labels.append(label)

    if len(keep_taxon_labels) == 0:
        raise Exception(
            'None of the tree tips were found in the character data')

    tree.prune_taxa(prune_taxa)
    tree.purge_taxon_namespace()

    # Prune character data
    keep_rows = []
    i = 0
    for label in init_row_headers:
        if label in keep_taxon_labels:
            keep_rows.append(i)
        else:
            print('Could not find {} in tree tips, pruning'.format(label))
        i += 1
    char_mtx = char_mtx.slice(keep_rows)

    # Standardize character matrix if requested
    tip_count, num_vars = char_mtx.shape
    if sum_to_one:
        for i in range(tip_count):
            sc = float(1.0) / np.sum(char_mtx[i])
            for j in range(num_vars):
                char_mtx[i, j] *= sc

    # Initialize data matrix
    num_nodes = len(tree.nodes())
    data_shape = (num_nodes, num_vars, 2 if calc_std_err else 1)
    data = np.zeros(data_shape, dtype=float)

    # Initialize headers
    row_headers = []
    tip_col_headers = char_mtx.get_column_headers()
    tip_row_headers = char_mtx.get_row_headers()
    tip_lookup = dict([
        (tip_row_headers[i].replace('_', ' '), i) for i in range(tip_count)])

    # Get the number of internal nodes in the tree
    internal_node_count = num_nodes - tip_count

    # Loop through the tree and set the matrix index for each node
    # Also set data values
    node_headers = []
    node_i = tip_count
    tip_i = 0
    node_index_lookup = {}
    for node in tree.nodes():
        label = _get_node_label(node)
        if len(node.child_nodes()) == 0:
            # Tip
            node_index_lookup[label] = tip_i
            row_headers.append(label)
            data[tip_i, :, 0] = char_mtx[tip_lookup[label]]
            tip_i += 1
        else:
            # Internal node
            node_index_lookup[label] = node_i
            node_headers.append(label)
            data[node_i, :, 0] = np.zeros((1, num_vars), dtype=float)
            node_i += 1

    # Row headers should be extended with node headers
    row_headers.extend(node_headers)

    # For each variable
    for x in range(num_vars):
        # Compute the ML estimate of the root
        full_mcp = np.zeros(
            (internal_node_count, internal_node_count), dtype=float)
        full_vcp = np.zeros(internal_node_count, dtype=float)

        for k in tree.postorder_edge_iter():
            i = k.head_node
            if len(i.child_nodes()) != 0:
                node_num_i = node_index_lookup[
                    _get_node_label(i)] - tip_count
                for j in i.child_nodes():
                    tbl = 2. / j.edge_length
                    full_mcp[node_num_i][node_num_i] += tbl
                    node_num_j = node_index_lookup[_get_node_label(j)]

                    if len(j.child_nodes()) == 0:
                        full_vcp[node_num_i] += (data[node_num_j, x, 0] * tbl)
                    else:
                        node_num_j -= tip_count
                        full_mcp[node_num_i][node_num_j] -= tbl
                        full_mcp[node_num_j][node_num_i] -= tbl
                        full_mcp[node_num_j][node_num_j] += tbl

        b = la.cho_factor(full_mcp)

        # These are the ML estimates for the ancestral states
        ml_est = la.cho_solve(b, full_vcp)
        sos = 0
        for k in tree.postorder_edge_iter():
            i = k.head_node
            node_num_i = node_index_lookup[_get_node_label(i)]
            if len(i.child_nodes()) != 0:
                data[node_num_i, x, 0] = ml_est[node_num_i - tip_count]

                if calc_std_err:
                    for j in i.child_nodes():
                        node_num_j = node_index_lookup[_get_node_label(j)]
                        temp = data[node_num_i, x, 0] - data[node_num_j, x, 0]
                        sos += temp * temp / j.edge_length

                    # nni is node_num_i adjusted for internal nodes only
                    nni = node_num_i - tip_count
                    qpq = full_mcp[nni][nni]
                    tm1 = np.delete(full_mcp, (nni), axis=0)
                    tm = np.delete(tm1, (nni), axis=1)
                    b = la.cho_factor(tm)
                    sol = la.cho_solve(b, tm1[:, nni])
                    temp_std_err = qpq - np.inner(tm1[:, nni], sol)
                    data[node_num_i, x, 1] = math.sqrt(
                        2.0 * sos / (
                            (internal_node_count - 1) * temp_std_err))

    depth_headers = ['maximum_likelihood']
    if calc_std_err:
        depth_headers.append('standard_error')
    mtx_headers = {'0': row_headers, '1': tip_col_headers, '2': depth_headers}
    return tree, Matrix(data, headers=mtx_headers)
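# A minimal usage sketch for calculate_continuous_ancestral_states. The tree
# and trait values below are hypothetical and only illustrate the expected
# shapes; character matrix row headers must match the tree tip labels:
#
#     tree = TreeWrapper.get(data='(A:1.0,(B:1.0,C:1.0):1.0);',
#                            schema='newick')
#     char_mtx = Matrix(np.array([[1.0], [2.0], [3.0]]),
#                       headers={'0': ['A', 'B', 'C'], '1': ['trait']})
#     tree, anc_mtx = calculate_continuous_ancestral_states(
#         tree, char_mtx, calc_std_err=True)
#     # anc_mtx[:, :, 0] holds ML estimates; with calc_std_err=True,
#     # anc_mtx[:, :, 1] holds standard errors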
def main(): """Main method for script.""" parser = argparse.ArgumentParser() parser.add_argument('--out_stats_matrix_filename', type=str, help='Location to write statistics matrix.') parser.add_argument('shapegrid_filename', type=str, help='File location of the shapegrid shapefile') parser.add_argument('pam_filename', type=str, help='File location of the PAM matrix for statistics') parser.add_argument('tree_filename', type=str, help='File location of the tree to use for statistics') parser.add_argument('tree_schema', choices=['newick', 'nexus'], help='The tree schema') parser.add_argument('out_geojson_filename', type=str, help='File location to write the output GeoJSON') parser.add_argument('out_csv_filename', type=str, help='File location to write the output CSV') parser.add_argument('out_matrix_filename', type=str, help='File location to write the output matrix') parser.add_argument('--layer', nargs=2, action='append', help='File location of a layer followed by a label') args = parser.parse_args() # Load data pam = Matrix.load(args.pam_filename) tree = TreeWrapper.get(path=args.tree_filename, schema=args.tree_schema) # Encode layers encoded_layers = encode_environment_layers(args.shapegrid_filename, args.layer) # Calculate PAM statistics stats_mtx = calculate_tree_site_statistics(pam, tree) if args.out_stats_matrix_filename: stats_mtx.write(args.out_stats_matrix_filename) # Join encoded layers and PAM statistics mtx = join_encoded_layers_and_pam_stats(encoded_layers, stats_mtx) # Generate GeoJSON geojson_data = create_geojson(args.shapegrid_filename, mtx) # Write GeoJSON with open(args.out_geojson_filename, 'w') as out_file: json.dump(geojson_data, out_file, indent=4) # Write matrix data new_rh = [] res = 0.5 for _, x, y in mtx.get_row_headers(): min_x = x - res max_x = x + res min_y = y - res max_y = y + res new_rh.append('"POLYGON (({} {},{} {},{} {},{} {},{} {}))"'.format( min_x, max_y, max_x, max_y, max_x, min_y, min_x, min_y, min_x, max_y)) mtx.write(args.out_matrix_filename) mtx.set_row_headers(new_rh) with open(args.out_csv_filename, 'w') as out_file: mtx.write_csv(out_file)
def test_package_valid(self, valid_ancestral_state_package):
    """Tests the calculate_continuous_ancestral_states method.

    Args:
        valid_ancestral_state_package (pytest.fixture): A parameterized
            pytest fixture defined in conftest.py that provides a valid
            test package.

    Note:
        * This test will need to evolve as the output format changes.  It
            will probably be better to return a data structure with various
            values for each node rather than assigning the value to the
            node label.

    Raises:
        IOError: When the tree or alignment cannot be loaded for the
            specified file extension.
        Exception: When a specified successful result value cannot be
            found.
    """
    # Get the data files
    (tree_filename, alignment_filename,
     results_filename) = valid_ancestral_state_package

    # Process the tree file
    _, tree_ext = os.path.splitext(tree_filename)
    if tree_ext == '.nex':
        tree_schema = 'nexus'
    elif tree_ext == '.xml':
        tree_schema = 'nexml'
    elif tree_ext == '.tre':
        tree_schema = 'newick'
    else:
        raise IOError(
            'Cannot handle tree with extension: {}'.format(tree_ext))
    tree = TreeWrapper.get(path=tree_filename, schema=tree_schema)

    # Process the alignment file
    _, align_ext = os.path.splitext(alignment_filename)
    if align_ext == '.csv':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_csv_alignment_flo(
                align_file)
    elif align_ext == '.json':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_json_alignment_flo(
                align_file)
    elif align_ext == '.phylip':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_phylip_alignment_flo(align_file)
    elif align_ext == '.tbl':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_table_alignment_flo(align_file)
    else:
        raise IOError(
            'Cannot handle alignments with extension: {}'.format(align_ext))
    char_mtx = data_readers.get_character_matrix_from_sequences_list(
        sequences)

    # Run analysis
    _, anc_mtx = anc_dp.calculate_continuous_ancestral_states(
        tree, char_mtx, calc_std_err=True, sum_to_one=False)

    # (For now) assume that the results file is a csv with row headers for
    # node labels and column headers for variables
    results = []
    h = None
    with open(results_filename) as results_file:
        for line in results_file:
            if h is None:
                # Get headers
                h = line.strip().split(',')[1:]
            else:
                # Add result (without label) to list
                node_result = [
                    float(i) for i in line.strip().split(',')[1:]]
                results.append(np.array(node_result, dtype=float))

    # Look for all results (only maximum likelihood)
    for row in anc_mtx[:, :, 0]:
        found = False
        for i in range(len(results)):
            # Allow for some wiggle room with decimal precision
            if np.all(np.isclose(row, results[i])):
                found = True
                results.pop(i)
                break
        if not found:
            raise Exception(
                'Could not find expected result: {} in results'.format(row))
def test_package_valid(self, valid_ancestral_distribution_package):
    """Tests the calculate_ancestral_distributions method.

    Args:
        valid_ancestral_distribution_package (pytest.fixture): A
            parameterized pytest fixture defined in conftest.py that
            provides a valid test package.

    Raises:
        IOError: When the tree or alignment cannot be loaded for the
            specified file extension.
        Exception: When a specified successful result value cannot be
            found.
    """
    # Get the data files
    (tree_filename, alignment_filename,
     results_filename) = valid_ancestral_distribution_package

    # Process the tree file
    _, tree_ext = os.path.splitext(tree_filename)
    if tree_ext == '.nex':
        tree_schema = 'nexus'
    elif tree_ext == '.xml':
        tree_schema = 'nexml'
    elif tree_ext == '.tre':
        tree_schema = 'newick'
    else:
        raise IOError(
            'Cannot handle tree with extension: {}'.format(tree_ext))
    tree = TreeWrapper.get(path=tree_filename, schema=tree_schema)

    # Process the alignment file
    _, align_ext = os.path.splitext(alignment_filename)
    if align_ext == '.csv':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_csv_alignment_flo(
                align_file)
    elif align_ext == '.json':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_json_alignment_flo(
                align_file)
    elif align_ext == '.phylip':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_phylip_alignment_flo(align_file)
    elif align_ext == '.tbl':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_table_alignment_flo(align_file)
    else:
        raise IOError(
            'Cannot handle alignments with extension: {}'.format(align_ext))
    char_mtx = data_readers.get_character_matrix_from_sequences_list(
        sequences)

    # Run analysis
    _, anc_mtx = anc_dp.calculate_ancestral_distributions(tree, char_mtx)

    # Assume that the results file is a csv with row headers for node
    # labels and output layer (maximum_likelihood / standard_error) and
    # column headers for variables
    ml_results = []
    std_err_results = []
    h = None
    with open(results_filename) as results_file:
        for line in results_file:
            if h is None:
                # Get headers
                h = line.strip().split(',')[1:]
            else:
                # Add result (without label) to appropriate list
                parts = line.strip().split(',')
                layer = parts[1].lower()
                values = np.array(
                    [float(i) for i in parts[2:]], dtype=float)
                if layer == 'maximum_likelihood':
                    ml_results.append(values)
                else:
                    std_err_results.append(values)
    assert len(ml_results) == len(std_err_results)
    print('ml results')
    print(ml_results)
    print('std err results')
    print(std_err_results)

    # Look for all results (ml and std err results should match rows)
    for row_idx in range(anc_mtx.shape[0]):
        found = False
        # Get rows from data
        ml_row = anc_mtx[row_idx, :, 0]
        std_err_row = anc_mtx[row_idx, :, 1]
        for i in range(len(ml_results)):
            print(ml_results[i])
            print(std_err_results[i])
            if np.all(np.isclose(ml_row, ml_results[i])) and \
                    np.all(np.isclose(std_err_row, std_err_results[i])):
                found = True
                ml_results.pop(i)
                std_err_results.pop(i)
                break
        if not found:
            raise Exception('Could not find {}, {} in results'.format(
                ml_row, std_err_row))