def main(): """Main method for script.""" parser = argparse.ArgumentParser() parser.add_argument('in_tree_filename', type=str, help='Path to initial tree') parser.add_argument('in_tree_schema', type=str, choices=['nexus', 'newick'], help='The input tree schema') parser.add_argument('out_tree_filename', type=str, help='Ouput tree path') parser.add_argument('out_tree_schema', type=str, choices=['nexus', 'newick'], help='The output tree schema') parser.add_argument('accepted_taxa_filename', type=str, help='File path to write out accepted taxon names') args = parser.parse_args() tree = TreeWrapper.get(path=args.in_tree_filename, schema=args.in_tree_schema) out_tree, accepted_taxa = get_and_replace_names(tree) # Write tree out_tree.write(path=args.out_tree_filename, schema=args.out_tree_schema) # Write accepted taxa with open(args.accepted_taxa_filename, 'w') as taxa_out_file: for taxon_name in accepted_taxa: taxa_out_file.write(taxon_name)
def test_valid(self):
    """Test the function with valid inputs."""
    # Create a tree
    tree = TreeWrapper.get(data='(A,(B,((C,D),(E,F))));', schema='newick')
    mtx = Matrix(
        np.random.random((6, 2, 1)),
        headers={
            '0': ['A', 'B', 'C', 'D', 'E', 'F'],
            '1': ['label', 'other_val']
        })
    # This should not fail
    annotators.annotate_tree_with_label(tree, mtx, label_column=0)
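# A note on the fixture above: the Matrix is 6 tips x 2 columns x 1 layer,
# with axis-0 headers matching the tree tip labels and axis-1 headers naming
# the columns; `label_column=0` annotates each tip from the 'label' column.
# A sketch of the same fixture built with explicit (arbitrary) values:
def _example_label_matrix():
    return Matrix(
        np.arange(12, dtype=float).reshape((6, 2, 1)),
        headers={
            '0': ['A', 'B', 'C', 'D', 'E', 'F'],  # row headers = tip labels
            '1': ['label', 'other_val']           # column headers
        })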
def main():
    """Main method for script."""
    # Note: input and output paths are hardcoded for a local run
    pam_fn = 'C:/Users/cj/Desktop/ryan_v3/pam.lmm'
    tree_fn = 'C:/Users/cj/Desktop/ryan_v3/squid_tree.nex'
    out_fn = 'C:/Users/cj/Desktop/ryan_v3/tree_mtx.lmm'

    with open(pam_fn, 'rb') as in_file:
        pam = Matrix.load_flo(in_file)
    tree = TreeWrapper.get(path=tree_fn, schema='nexus')
    tree_mtx = calculate_tree_site_statistics(pam, tree)
    with open(out_fn, 'wb') as out_file:
        tree_mtx.save(out_file)
    print(tree_mtx.max(axis=1))
    print(tree_mtx.max(axis=0))
def purge_tree(tree_filename, tree_schema, occurrence_filename, species_col):
    """Get a tree and purge taxa not in occurrence data."""
    tree = TreeWrapper.get(path=tree_filename, schema=tree_schema)

    # Collect the species names present in the occurrence data
    species = set()
    with open(occurrence_filename, 'r') as in_file:
        for line in in_file:
            parts = line.split(',')
            sp_name = parts[species_col].strip()
            species.add(sp_name)

    # Prune any taxa that do not appear in the occurrence data
    purge_taxa = [
        taxon for taxon in tree.taxon_namespace
        if taxon.label not in species
    ]
    tree.prune_taxa(purge_taxa)
    tree.purge_taxon_namespace()
    return tree
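# A minimal usage sketch for `purge_tree`, assuming a comma-delimited
# occurrence file whose third column (index 2) holds the species name;
# the file names and column layout here are illustrative, not from the
# repository:
pruned = purge_tree(
    'data/my_tree.nex', 'nexus', 'data/occurrences.csv', species_col=2)
pruned.write(path='data/pruned_tree.nex', schema='nexus')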
def test_valid(self, tmpdir):
    """Test the function with valid inputs.

    Args:
        tmpdir (:obj:`py.path.local`): A temporary directory test fixture
            generated by pytest.
    """
    # Create a tree
    tree = TreeWrapper.get(data='(A,(B,((C,D),(E,F))));', schema='newick')
    mtx = Matrix(
        np.random.random((6, 3, 2)),
        headers={
            '0': ['A', 'B', 'C', 'D', 'E', 'F'],
            '1': ['label', 'other_val', 'one_more_val']
        })
    # This should not fail
    output_directory = os.path.join(tmpdir.dirname, 'plots')
    create_distribution_plots(tree, mtx, output_directory)
def get_squidded_tree(tree_fn, tree_schema, squid_json):
    """Load a tree and annotate its tips with squid identifiers."""
    tree = TreeWrapper.get(path=tree_fn, schema=tree_schema)

    # Skip the first line of the file, then parse the rest as JSON
    with open(squid_json) as in_file:
        in_file.readline()
        squid_list_json = json.loads(in_file.read())

    squid_dict = {
        i['scientific_name'].replace('_', ' '): i['header']
        for i in squid_list_json
    }
    tree.annotate_tree_tips('squid', squid_dict)
    return tree
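# The squid JSON file is expected to contain, after its first (skipped)
# line, a JSON list of objects with 'scientific_name' and 'header' keys.
# An illustrative example of the parsed content (values are made up):
#
#     [
#         {"scientific_name": "Genus_species_a", "header": "squid-001"},
#         {"scientific_name": "Genus_species_b", "header": "squid-002"}
#     ]
#
# Underscores in the scientific names are replaced with spaces so the
# dictionary keys match the tree tip labels before annotation.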
def main(): """Main method for script.""" parser = argparse.ArgumentParser() parser.add_argument('shapegrid_filename', type=str, help='File location of the shapegrid shapefile') parser.add_argument('pam_filename', type=str, help='File location of the PAM matrix for statistics') parser.add_argument('tree_filename', type=str, help='File location of the tree to use for statistics') parser.add_argument('tree_schema', choices=['newick', 'nexus'], help='The tree schema') parser.add_argument('out_geojson_filename', type=str, help='File location to write the output GeoJSON') parser.add_argument('--layer', nargs=2, action='append', help='File location of a layer followed by a label') args = parser.parse_args() # Load data pam = Matrix.load(args.pam_filename) tree = TreeWrapper.get(path=args.tree_filename, schema=args.tree_schema) # Encode layers encoded_layers = encode_environment_layers(args.shapegrid_filename, args.layer) # Calculate PAM statistics stats_mtx = calculate_tree_site_statistics(pam, tree) # Join encoded layers and PAM statistics mtx = join_encoded_layers_and_pam_stats(encoded_layers, stats_mtx) # Generate GeoJSON geojson_data = create_geojson(args.shapegrid_filename, mtx) # Write GeoJSON with open(args.out_geojson_filename, 'w') as out_file: json.dump(geojson_data, out_file)
def main():
    """Main method for script."""
    parser = argparse.ArgumentParser()
    parser.add_argument('in_tree_filename', type=str,
                        help='The file location of the input tree')
    parser.add_argument('in_tree_schema', type=str,
                        choices=['newick', 'nexus'],
                        help='The schema of the input tree')
    parser.add_argument('out_tree_filename', type=str,
                        help='The file location of the output tree')
    parser.add_argument('out_tree_schema', type=str,
                        choices=['newick', 'nexus'],
                        help='The schema of the output tree')
    args = parser.parse_args()

    tree = TreeWrapper.get(
        path=args.in_tree_filename, schema=args.in_tree_schema)
    out_tree = get_and_replace_names(tree)
    out_tree.write(path=args.out_tree_filename, schema=args.out_tree_schema)
        '-c', '--out_csv_filename', type=str,
        help='If provided, write the output character matrix CSV '
             'to this file location')
    args = parser.parse_args()

    # Check that input files exist
    if not os.path.exists(args.in_tree_filename):
        raise IOError(
            'Input tree {} does not exist'.format(args.in_tree_filename))
    if not os.path.exists(args.data_filename):
        raise IOError(
            'Input data file {} does not exist'.format(args.data_filename))

    # Read the tree
    tree = TreeWrapper.get(
        path=args.in_tree_filename, schema=args.in_tree_schema)

    # Read data
    if args.data_format == 'csv':
        with open(args.data_filename) as in_file:
            sequences, headers = data_readers.read_csv_alignment_flo(
                in_file)
    elif args.data_format == 'json':
        with open(args.data_filename) as in_file:
            sequences, headers = data_readers.read_json_alignment_flo(
                in_file)
    elif args.data_format == 'phylip':
        with open(args.data_filename) as in_file:
            sequences = data_readers.read_phylip_alignment_flo(in_file)
        headers = None
    elif args.data_format == 'table':
        # Assumed to mirror the test code later in this section, behaving
        # like the phylip branch (no headers returned)
        with open(args.data_filename) as in_file:
            sequences = data_readers.read_table_alignment_flo(in_file)
        headers = None
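# The snippet above ends before the alignment is used.  In the test code
# later in this section, the sequences are converted to a character matrix
# before analysis; a sketch of that next step, under the same names:
char_mtx = data_readers.get_character_matrix_from_sequences_list(sequences)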
def main(): """Main method for script.""" parser = argparse.ArgumentParser() parser.add_argument('--out_stats_matrix_filename', type=str, help='Location to write statistics matrix.') parser.add_argument('shapegrid_filename', type=str, help='File location of the shapegrid shapefile') parser.add_argument('pam_filename', type=str, help='File location of the PAM matrix for statistics') parser.add_argument('tree_filename', type=str, help='File location of the tree to use for statistics') parser.add_argument('tree_schema', choices=['newick', 'nexus'], help='The tree schema') parser.add_argument('out_geojson_filename', type=str, help='File location to write the output GeoJSON') parser.add_argument('out_csv_filename', type=str, help='File location to write the output CSV') parser.add_argument('out_matrix_filename', type=str, help='File location to write the output matrix') parser.add_argument('--layer', nargs=2, action='append', help='File location of a layer followed by a label') args = parser.parse_args() # Load data pam = Matrix.load(args.pam_filename) tree = TreeWrapper.get(path=args.tree_filename, schema=args.tree_schema) # Encode layers encoded_layers = encode_environment_layers(args.shapegrid_filename, args.layer) # Calculate PAM statistics stats_mtx = calculate_tree_site_statistics(pam, tree) if args.out_stats_matrix_filename: stats_mtx.write(args.out_stats_matrix_filename) # Join encoded layers and PAM statistics mtx = join_encoded_layers_and_pam_stats(encoded_layers, stats_mtx) # Generate GeoJSON geojson_data = create_geojson(args.shapegrid_filename, mtx) # Write GeoJSON with open(args.out_geojson_filename, 'w') as out_file: json.dump(geojson_data, out_file, indent=4) # Write matrix data new_rh = [] res = 0.5 for _, x, y in mtx.get_row_headers(): min_x = x - res max_x = x + res min_y = y - res max_y = y + res new_rh.append('"POLYGON (({} {},{} {},{} {},{} {},{} {}))"'.format( min_x, max_y, max_x, max_y, max_x, min_y, min_x, min_y, min_x, max_y)) mtx.write(args.out_matrix_filename) mtx.set_row_headers(new_rh) with open(args.out_csv_filename, 'w') as out_file: mtx.write_csv(out_file)
def test_package_valid(self, valid_ancestral_state_package):
    """Tests the calculate_continuous_ancestral_states method.

    Args:
        valid_ancestral_state_package (pytest.fixture): A parameterized
            pytest fixture defined in conftest.py that provides a valid
            test package.

    Note:
        * This test will need to evolve as the output format changes.  It
            will probably be better to return a data structure with
            various values for each node rather than assigning the value
            to the node label.

    Raises:
        IOError: When the tree or alignment cannot be loaded for the
            specified file extension.
        Exception: When a specified successful result value cannot be
            found.
    """
    # Get the data files
    (tree_filename, alignment_filename,
     results_filename) = valid_ancestral_state_package

    # Process the tree file
    _, tree_ext = os.path.splitext(tree_filename)
    if tree_ext == '.nex':
        tree_schema = 'nexus'
    elif tree_ext == '.xml':
        tree_schema = 'nexml'
    elif tree_ext == '.tre':
        tree_schema = 'newick'
    else:
        raise IOError(
            'Cannot handle tree with extension: {}'.format(tree_ext))
    tree = TreeWrapper.get(path=tree_filename, schema=tree_schema)

    # Process the alignment file
    _, align_ext = os.path.splitext(alignment_filename)
    if align_ext == '.csv':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_csv_alignment_flo(
                align_file)
    elif align_ext == '.json':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_json_alignment_flo(
                align_file)
    elif align_ext == '.phylip':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_phylip_alignment_flo(align_file)
    elif align_ext == '.tbl':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_table_alignment_flo(align_file)
    else:
        raise IOError(
            'Cannot handle alignments with extension: {}'.format(align_ext))

    char_mtx = data_readers.get_character_matrix_from_sequences_list(
        sequences)

    # Run analysis
    _, anc_mtx = anc_dp.calculate_continuous_ancestral_states(
        tree, char_mtx, calc_std_err=True, sum_to_one=False)

    # New testing method
    # (For now) assume that results file is csv with row headers for
    # node labels and column headers for variables
    results = []
    h = None
    with open(results_filename) as results_file:
        for line in results_file:
            if h is None:
                # Get headers
                h = line.strip().split(',')[1:]
            else:
                # Add result (without label) to list
                node_result = [
                    float(i) for i in line.strip().split(',')[1:]
                ]
                results.append(np.array(node_result, dtype=float))

    # Look for all results (only maximum likelihood)
    for row in anc_mtx[:, :, 0]:
        found = False
        for i in range(len(results)):
            # Allow for some wiggle room with decimal precision
            if np.all(np.isclose(row, results[i])):
                found = True
                results.pop(i)
                break
        if not found:
            raise Exception(
                'Could not find expected result: {} in results'.format(row))
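# The parsing loop above implies a results CSV of this shape: a header
# row, then one row per node with the node label first and one column
# per variable.  An illustrative (made-up) example:
#
#     node,var_1,var_2
#     internal_0,0.521,1.043
#     internal_1,0.498,0.977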
def test_package_valid(self, valid_ancestral_distribution_package):
    """Tests the calculate_ancestral_distributions method.

    Args:
        valid_ancestral_distribution_package (pytest.fixture): A
            parameterized pytest fixture defined in conftest.py that
            provides a valid test package.

    Raises:
        IOError: When the tree or alignment cannot be loaded for the
            specified file extension.
        Exception: When a specified successful result value cannot be
            found.
    """
    # Get the data files
    (tree_filename, alignment_filename,
     results_filename) = valid_ancestral_distribution_package

    # Process the tree file
    _, tree_ext = os.path.splitext(tree_filename)
    if tree_ext == '.nex':
        tree_schema = 'nexus'
    elif tree_ext == '.xml':
        tree_schema = 'nexml'
    elif tree_ext == '.tre':
        tree_schema = 'newick'
    else:
        raise IOError(
            'Cannot handle tree with extension: {}'.format(tree_ext))
    tree = TreeWrapper.get(path=tree_filename, schema=tree_schema)

    # Process the alignment file
    _, align_ext = os.path.splitext(alignment_filename)
    if align_ext == '.csv':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_csv_alignment_flo(
                align_file)
    elif align_ext == '.json':
        with open(alignment_filename) as align_file:
            sequences, headers = data_readers.read_json_alignment_flo(
                align_file)
    elif align_ext == '.phylip':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_phylip_alignment_flo(align_file)
    elif align_ext == '.tbl':
        with open(alignment_filename) as align_file:
            sequences = data_readers.read_table_alignment_flo(align_file)
    else:
        raise IOError(
            'Cannot handle alignments with extension: {}'.format(align_ext))

    char_mtx = data_readers.get_character_matrix_from_sequences_list(
        sequences)

    # Run analysis
    _, anc_mtx = anc_dp.calculate_ancestral_distributions(tree, char_mtx)

    # Testing method
    # Assume that the results file is a csv with row headers for node
    # labels and output layer (maximum_likelihood / standard_error)
    # and column headers for variables
    ml_results = []
    std_err_results = []
    h = None
    with open(results_filename) as results_file:
        for line in results_file:
            if h is None:
                # Get headers
                h = line.strip().split(',')[1:]
            else:
                # Add result (without label) to appropriate list
                parts = line.strip().split(',')
                layer = parts[1].lower()
                values = np.array(
                    [float(i) for i in parts[2:]], dtype=float)
                if layer == 'maximum_likelihood':
                    ml_results.append(values)
                else:
                    std_err_results.append(values)
    assert len(ml_results) == len(std_err_results)

    # Look for all results (ml and std err results should match rows)
    for row_idx in range(anc_mtx.shape[0]):
        found = False
        # Get rows from data
        ml_row = anc_mtx[row_idx, :, 0]
        std_err_row = anc_mtx[row_idx, :, 1]
        for i in range(len(ml_results)):
            if np.all(np.isclose(ml_row, ml_results[i])) and \
                    np.all(np.isclose(std_err_row, std_err_results[i])):
                found = True
                ml_results.pop(i)
                std_err_results.pop(i)
                break
        if not found:
            raise Exception('Could not find {}, {} in results'.format(
                ml_row, std_err_row))
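# The parsing loop above implies a results CSV with a header row, then
# two rows per node: the node label, the layer name (maximum_likelihood
# or standard_error), and one column per variable.  An illustrative
# (made-up) example:
#
#     node,layer,var_1,var_2
#     internal_0,maximum_likelihood,0.521,1.043
#     internal_0,standard_error,0.034,0.051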