def test_transpose_trait_table_fields(self):
    """transpose_trait_table_fields should swap rows and columns of a trait table"""
    # Transposing makes it easy to flip tab-delimited tables before
    # analysis starts (and gives other code an internal method to call).

    # Input table: one row per gene, one column per genome id.
    genome_header = "genomes\t123456\t123457\t123458\t123459\n"
    gene_rows = [
        ["gene1", 0, 3, 5, 0],
        ["gene2", 1, 2, 1, 0],
        ["gene3", 0, 0, 0, 1],
    ]

    # Expected table: one row per genome id, one column per gene.
    expected_header = "genomes\tgene1\tgene2\tgene3\n"
    expected_rows = [
        ['123456', 0, 1, 0],
        ['123457', 3, 2, 0],
        ['123458', 5, 1, 0],
        ['123459', 0, 0, 1],
    ]

    observed_header, observed_rows = transpose_trait_table_fields(
        gene_rows, header=genome_header, id_row_idx=0)

    self.assertEqual(observed_header, expected_header)
    self.assertEqual(observed_rows, expected_rows)
def test_transpose_trait_table_fields(self):
    """transpose_trait_table_fields should swap rows and columns of a trait table"""
    # Transposing makes it easy to flip tab-delimited tables before
    # analysis starts (and gives other code an internal method to call).

    # Input table: one row per gene, one column per genome id.
    genome_header = "genomes\t123456\t123457\t123458\t123459\n"
    gene_rows = [
        ["gene1", 0, 3, 5, 0],
        ["gene2", 1, 2, 1, 0],
        ["gene3", 0, 0, 0, 1],
    ]

    # Expected table: one row per genome id, one column per gene.
    expected_header = "genomes\tgene1\tgene2\tgene3\n"
    expected_rows = [
        ['123456', 0, 1, 0],
        ['123457', 3, 2, 0],
        ['123458', 5, 1, 0],
        ['123459', 0, 0, 1],
    ]

    observed_header, observed_rows = transpose_trait_table_fields(
        gene_rows, header=genome_header, id_row_idx=0)

    self.assertEqual(observed_header, expected_header)
    self.assertEqual(observed_rows, expected_rows)
def main():
    """Generate test trees and trait tables given command-line parameters.

    Parses the script options, loads the input tree and trait table, makes
    their labels consistent, and then iterates over the datasets produced by
    yield_genome_test_data_by_distance.  For every dataset that is not
    skipped by --limit_to_tips, four files are written to opts.output_dir:

      * the test tree (via write_tree)
      * the expected traits of the tip to predict (tab-delimited)
      * the same expected traits, transposed and in BIOM format
      * the test trait table with the expected-traits row removed

    All input and output file handles are closed deterministically.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def say(*parts):
        """Print the parts separated by single spaces."""
        # A single parenthesised argument prints identically on Python 2
        # (print statement) and Python 3 (print function).
        print(" ".join(map(str, parts)))

    def report(*parts):
        """Print a progress message, but only when --verbose was given."""
        if opts.verbose:
            say(*parts)

    def write_text(path, text):
        """Write text to path, closing the handle even if the write fails."""
        with open(path, "w") as handle:
            handle.write(text)

    report("Loading trait table...")
    with open(opts.input_trait_table, "U") as input_trait_table:

        report("Loading tree...")
        # PicrustNode seems to run into very slow/memory-intensive
        # performance, so the default node constructor is used:
        # tree = DndParser(open(opts.input_tree), constructor=PicrustNode)
        with open(opts.input_tree) as input_tree:
            tree = DndParser(input_tree)

        report("Parsing trait table...")
        # Find which taxa are to be used in tests
        # (by default trait table taxa)
        trait_table_header, trait_table_fields = \
            parse_trait_table(input_trait_table)

        report("Ensuring tree and trait table labels are formatted "
               "consistently...")
        label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)
        fix_tree_labels(tree, label_conversion_fns)

        trait_table_fields = convert_trait_table_entries(
            trait_table_fields,
            value_conversion_fns=[],
            label_conversion_fns=label_conversion_fns)

        # The fields may be produced lazily from the open file, so they are
        # materialised before the handle is closed.
        trait_table_fields = list(trait_table_fields)

    # NOTE: this count is reported regardless of --verbose.
    say("Number of trait table fields with single quotes:",
        len([t for t in trait_table_fields if "'" in t[0]]))

    report("Making output directory...")
    make_output_dir(opts.output_dir)

    if opts.limit_to_tips:
        included_tips = opts.limit_to_tips.split(",")
        report("Limiting test datasets to %i tips: %s"
               % (len(included_tips), included_tips))
    else:
        # False (not None) is what gets passed on as limit_to_tips below.
        included_tips = False

    method_fns = {
        "exclude_tips_by_distance":
            make_distance_based_exclusion_fn,
        "randomize_tip_labels_by_distance":
            make_distance_based_tip_label_randomizer,
    }
    test_fn_factory = method_fns[opts.method]

    report("Setting tree modification method to:", opts.method)
    report("(%s)" % test_fn_factory.__doc__)

    modify_tree = True
    if opts.suppress_tree_modification:
        report("Suppressing modification of tree when making test datasets")
        modify_tree = False

    report("Starting generation of test datsets")
    test_datasets = yield_genome_test_data_by_distance(
        tree, trait_table_fields, test_fn_factory,
        min_dist=opts.min_dist, max_dist=opts.max_dist,
        increment=opts.dist_increment, modify_tree=modify_tree,
        limit_to_tips=included_tips, verbose=opts.verbose)

    report("Writing files for test datasets")
    for (curr_dist, test_tree, tip_to_predict,
         expected_traits, test_trait_table_fields) in test_datasets:

        if included_tips is not False and tip_to_predict not in included_tips:
            report("Skipping tip %s: limiting to tip(s): %s"
                   % (tip_to_predict, included_tips))
            continue

        # Make a safe version of tip to predict
        # so odd characters like | don't mess up the OS
        safe_tip_to_predict = "'%s'" % tip_to_predict

        # Write tree
        base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist]))
        curr_filepath = write_tree(opts.output_dir, base_name, test_tree,
                                   safe_tip_to_predict)
        report("Wrote test tree to: %s" % curr_filepath)

        # Write expected trait table
        base_name = "--".join(map(str, ["exp_traits", opts.method, curr_dist,
                                        safe_tip_to_predict]))
        exp_trait_table_lines = [trait_table_header,
                                 "\t".join(expected_traits) + "\n"]
        filename = os.path.join(opts.output_dir, base_name)
        report("Writing expected trait table to:", filename)
        write_text(filename, "".join(exp_trait_table_lines))

        # Output a transposed, BIOM format expectation table for comparison
        # with predict_traits output.
        # NOTE: this is a clumsy way of getting the translated trait table,
        # but more elegant, direct methods (directly feeding data to biom's
        # table_factory) weren't working readily.  In the future, we should
        # streamline this process.  Leaving as is for now since this code is
        # mostly for developers, so speed/elegance are probably not
        # essential here.

        # Reload the tab-delimited trait table that was just written
        with open(filename, "U") as exp_trait_file:
            header, fields = parse_trait_table(exp_trait_file)
            fields = list(fields)  # converts generator to list

        # Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines = \
            transpose_trait_table_fields(
                fields, header, id_row_idx=0,
                input_header_delimiter="\t", output_delimiter="\t")

        # Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(
            ["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        # Write BIOM format expected trait table
        base_name = "--".join(map(str, ["exp_biom_traits", opts.method,
                                        curr_dist, safe_tip_to_predict]))
        expected_biom_table = parse_table_to_biom(
            trans_trait_table.split('\n'), table_format="tab-delimited")
        filename = os.path.join(opts.output_dir, base_name)
        report("Writing BIOM-format expected trait table to:", filename)
        write_text(filename, format_biom_table(expected_biom_table))

        # Write test trait table (without the row that is to be predicted)
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(
            ["\t".join(r) + "\n" for r in test_trait_table_fields])

        base_name = "--".join(map(str, ["test_trait_table", opts.method,
                                        curr_dist, safe_tip_to_predict]))
        filename = os.path.join(opts.output_dir, base_name)
        report("Writing test trait table to:", filename)
        write_text(filename, "".join(test_trait_table_lines))

    report("Done generating test datasets")
def main():
    """Generate test trees and trait tables given command-line parameters.

    Parses the script options, loads the input tree and trait table, makes
    their labels consistent, and then iterates over the datasets produced by
    yield_genome_test_data_by_distance.  For every dataset that is not
    skipped by --limit_to_tips, four files are written to opts.output_dir:

      * the test tree (via write_tree)
      * the expected traits of the tip to predict (tab-delimited)
      * the same expected traits, transposed and in BIOM format
      * the test trait table with the expected-traits row removed

    All input and output file handles are closed deterministically.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def say(*parts):
        """Print the parts separated by single spaces."""
        # A single parenthesised argument prints identically on Python 2
        # (print statement) and Python 3 (print function).
        print(" ".join(map(str, parts)))

    def report(*parts):
        """Print a progress message, but only when --verbose was given."""
        if opts.verbose:
            say(*parts)

    def write_text(path, text):
        """Write text to path, closing the handle even if the write fails."""
        with open(path, "w") as handle:
            handle.write(text)

    report("Loading trait table...")
    with open(opts.input_trait_table, "U") as input_trait_table:

        report("Loading tree...")
        # PicrustNode seems to run into very slow/memory-intensive
        # performance, so the default node constructor is used:
        # tree = DndParser(open(opts.input_tree), constructor=PicrustNode)
        with open(opts.input_tree) as input_tree:
            tree = DndParser(input_tree)

        report("Parsing trait table...")
        # Find which taxa are to be used in tests
        # (by default trait table taxa)
        trait_table_header, trait_table_fields = \
            parse_trait_table(input_trait_table)

        report("Ensuring tree and trait table labels are formatted "
               "consistently...")
        label_conversion_fns = set_label_conversion_fns(verbose=opts.verbose)
        fix_tree_labels(tree, label_conversion_fns)

        trait_table_fields = convert_trait_table_entries(
            trait_table_fields,
            value_conversion_fns=[],
            label_conversion_fns=label_conversion_fns)

        # The fields may be produced lazily from the open file, so they are
        # materialised before the handle is closed.
        trait_table_fields = list(trait_table_fields)

    # NOTE: this count is reported regardless of --verbose.
    say("Number of trait table fields with single quotes:",
        len([t for t in trait_table_fields if "'" in t[0]]))

    report("Making output directory...")
    make_output_dir(opts.output_dir)

    if opts.limit_to_tips:
        included_tips = opts.limit_to_tips.split(",")
        report("Limiting test datasets to %i tips: %s"
               % (len(included_tips), included_tips))
    else:
        # False (not None) is what gets passed on as limit_to_tips below.
        included_tips = False

    method_fns = {
        "exclude_tips_by_distance":
            make_distance_based_exclusion_fn,
        "randomize_tip_labels_by_distance":
            make_distance_based_tip_label_randomizer,
    }
    test_fn_factory = method_fns[opts.method]

    report("Setting tree modification method to:", opts.method)
    report("(%s)" % test_fn_factory.__doc__)

    modify_tree = True
    if opts.suppress_tree_modification:
        report("Suppressing modification of tree when making test datasets")
        modify_tree = False

    report("Starting generation of test datsets")
    test_datasets = yield_genome_test_data_by_distance(
        tree, trait_table_fields, test_fn_factory,
        min_dist=opts.min_dist, max_dist=opts.max_dist,
        increment=opts.dist_increment, modify_tree=modify_tree,
        limit_to_tips=included_tips, verbose=opts.verbose)

    report("Writing files for test datasets")
    for (curr_dist, test_tree, tip_to_predict,
         expected_traits, test_trait_table_fields) in test_datasets:

        if included_tips is not False and tip_to_predict not in included_tips:
            report("Skipping tip %s: limiting to tip(s): %s"
                   % (tip_to_predict, included_tips))
            continue

        # Make a safe version of tip to predict
        # so odd characters like | don't mess up the OS
        safe_tip_to_predict = "'%s'" % tip_to_predict

        # Write tree
        base_name = "--".join(map(str, ["test_tree", opts.method, curr_dist]))
        curr_filepath = write_tree(opts.output_dir, base_name, test_tree,
                                   safe_tip_to_predict)
        report("Wrote test tree to: %s" % curr_filepath)

        # Write expected trait table
        base_name = "--".join(map(str, ["exp_traits", opts.method, curr_dist,
                                        safe_tip_to_predict]))
        exp_trait_table_lines = [trait_table_header,
                                 "\t".join(expected_traits) + "\n"]
        filename = os.path.join(opts.output_dir, base_name)
        report("Writing expected trait table to:", filename)
        write_text(filename, "".join(exp_trait_table_lines))

        # Output a transposed, BIOM format expectation table for comparison
        # with predict_traits output.
        # NOTE: this is a clumsy way of getting the translated trait table,
        # but more elegant, direct methods (directly feeding data to biom's
        # table_factory) weren't working readily.  In the future, we should
        # streamline this process.  Leaving as is for now since this code is
        # mostly for developers, so speed/elegance are probably not
        # essential here.

        # Reload the tab-delimited trait table that was just written
        with open(filename, "U") as exp_trait_file:
            header, fields = parse_trait_table(exp_trait_file)
            fields = list(fields)  # converts generator to list

        # Transpose table for .BIOM format so that Observation ids are KOs
        transposed_header, transposed_trait_table_lines = \
            transpose_trait_table_fields(
                fields, header, id_row_idx=0,
                input_header_delimiter="\t", output_delimiter="\t")

        # Eliminate newline in header
        trans_trait_table_lines = [transposed_header.strip()]
        trans_trait_table_lines.extend(
            ["\t".join(r) for r in transposed_trait_table_lines])
        trans_trait_table = '\n'.join(trans_trait_table_lines)

        # Write BIOM format expected trait table
        base_name = "--".join(map(str, ["exp_biom_traits", opts.method,
                                        curr_dist, safe_tip_to_predict]))
        expected_biom_table = parse_table_to_biom(
            trans_trait_table.split('\n'), table_format="tab-delimited")
        filename = os.path.join(opts.output_dir, base_name)
        report("Writing BIOM-format expected trait table to:", filename)
        write_text(filename, format_biom_table(expected_biom_table))

        # Write test trait table (without the row that is to be predicted)
        if expected_traits in test_trait_table_fields:
            test_trait_table_fields.remove(expected_traits)
        test_trait_table_lines = [trait_table_header]
        test_trait_table_lines.extend(
            ["\t".join(r) + "\n" for r in test_trait_table_fields])

        base_name = "--".join(map(str, ["test_trait_table", opts.method,
                                        curr_dist, safe_tip_to_predict]))
        filename = os.path.join(opts.output_dir, base_name)
        report("Writing test trait table to:", filename)
        write_text(filename, "".join(test_trait_table_lines))

    report("Done generating test datasets")