def pool_test_dataset_dir(obs_dir_fp,exp_dir_fp,file_name_delimiter="--",\ file_name_field_order=\ {'file_type':0,"prediction_method":1,"weighting_method":2,"holdout_method":3,\ "distance":4,"organism":5},strict=False, verbose=True,pool_by=['distance']): """Retrun pooled control & evaluation results from the given directories obs_dir_fp -- directory containing PICRUST-predicted genomes. These MUST start with 'predict_traits', and must contain the values specified in file_name_field_order,\ separated by the delimiter given in file_name_delimiter. For example: predict_traits--exclude_tips_by_distance--0.87--'NC_000913|646311926' exp_dir_fp -- as obs_dir_fp above, but expectation file names (usually sequenced genomes with known gene content) must start with exp_biom_traits file_name_delimiter -- the delimiter that separates metadata stored in the filename NOTE: technically this isn't the best way of doing things. We may want at some point to revisit this setup and store metadata about each comparison in a separate file. But storing in the filename is convenient for our initial analysis. file_name_field_order -- the order of the required metadata fields in the filename. Required fields are file_type,method,distance,and organism pool_by -- if passed, concatenate traits from each trial that is identical in this category. e.g. pool_by 'distance' will pool traits across individual test genomes with the same holdout distance. The method assumes that for each file type in the observed directory, a paired file is also found in the exp_dir with similar method, distance, and organism, but a varied file type (test_tree, test_trait_table) Process: 1. Search test directory for all gene predictions in the correct format 2. For each, find the corresponding expected trait table in the expectation file 3. Pool by specified pool_by values 4. 
Return dicts of pooled observation,expectation values """ trials = defaultdict(list) #We'll want a quick unzip fn for converting points to trials #TODO: separate out into a 'get_paired_data_from_dirs' function pooled_observations = {} pooled_expectations = {} pairs = iter_prediction_expectation_pairs(obs_dir_fp,exp_dir_fp,file_name_field_order,file_name_delimiter,verbose=verbose) file_number = 0 for obs_table,exp_table,filename in pairs: #print "analyzing filename:",filename filename_metadata= get_metadata_from_filename(filename,file_name_field_order,\ file_name_delimiter,verbose=verbose) #base_tag = '%s\t%s\t' %(filename_metadata['holdout_method'],filename_metadata['prediction_method']) #tags = [base_tag+'all_results'] if 'file_type' in pool_by: pool_by.remove('file_type') #we do this manually at the end combined_tag = ['all']*len(file_name_field_order.keys()) for field in file_name_field_order.keys(): #print combined_tag #print file_name_field_order idx = file_name_field_order[field] #print idx if field in pool_by: combined_tag[idx] = filename_metadata[field] tags=[file_name_delimiter.join(combined_tag)] if verbose: print "Pooling by:", pool_by print "Combined tags:",tags pooled_observations,pooled_expectations =\ update_pooled_data(obs_table,exp_table,tags,pooled_observations,\ pooled_expectations,str(file_number),verbose=verbose) file_number += 1 return pooled_observations,pooled_expectations
# NOTE(review): this is a fragment -- the enclosing function's header, its
# for-loop, and the except-clause that leads into this skip branch are
# outside the visible chunk, and main() below is truncated mid-body.
# Indentation here is reconstructed; confirm against the full file.
            print "Missing expectation file....skipping!"
            continue

        # Tag every table with an 'all_results' bucket plus a combined tag
        # naming the specific pooled field values for this file.
        base_tag = '%s\t%s\t' %(holdout_method,prediction_method)
        tags = [base_tag+'all_results']
        combined_tag = base_tag +\
          "\t".join([str(field)+"_"+str(filename_components[file_name_field_order[field]]) for field in pool_by])
        tags.append(combined_tag)
        #if verbose:
        #    print "Pooling by:", pool_by
        #    print "Combined tags:",tags

        #TODO: abstract out pooling into its own function
        # Fields not pooled over (None when absent from the filename).
        non_pooled_fields = [filename_components.get(file_name_field_order[k],None) for k in file_name_field_order.keys() if k not in pool_by]
        pooled_observations,pooled_expectations =\
          update_pooled_data(obs_table,exp_table,tags,pooled_observations,\
          pooled_expectations,str(file_number),verbose=verbose)
        #if verbose:
        #    for tag in pooled_observations.keys():
        #        print "Merged obs biom:", pooled_observations[tag]
        #        print "\nMedged *exp* biom:", pooled_expectations[tag]

    return run_accuracy_calculations_on_pooled_data(pooled_observations,\
      pooled_expectations,roc_success_criteria=roc_success_criteria,verbose=verbose)


def main():
    # Script entry point (truncated in this chunk).
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    # pool_by arrives as a comma-separated string on the command line.
    pool_by = opts.pool_by.split(',')
def pool_test_dataset_dir(obs_dir_fp,exp_dir_fp,file_name_delimiter="--",\ file_name_field_order=\ {'file_type':0,"prediction_method":1,"weighting_method":2,"holdout_method":3,\ "distance":4,"organism":5},strict=False, verbose=True,pool_by=['distance']): """Retrun pooled control & evaluation results from the given directories obs_dir_fp -- directory containing PICRUST-predicted genomes. These MUST start with 'predict_traits', and must contain the values specified in file_name_field_order,\ separated by the delimiter given in file_name_delimiter. For example: predict_traits--exclude_tips_by_distance--0.87--'NC_000913|646311926' exp_dir_fp -- as obs_dir_fp above, but expectation file names (usually sequenced genomes with known gene content) must start with exp_biom_traits file_name_delimiter -- the delimiter that separates metadata stored in the filename NOTE: technically this isn't the best way of doing things. We may want at some point to revisit this setup and store metadata about each comparison in a separate file. But storing in the filename is convenient for our initial analysis. file_name_field_order -- the order of the required metadata fields in the filename. Required fields are file_type,method,distance,and organism pool_by -- if passed, concatenate traits from each trial that is identical in this category. e.g. pool_by 'distance' will pool traits across individual test genomes with the same holdout distance. The method assumes that for each file type in the observed directory, a paired file is also found in the exp_dir with similar method, distance, and organism, but a varied file type (test_tree, test_trait_table) Process: 1. Search test directory for all gene predictions in the correct format 2. For each, find the corresponding expected trait table in the expectation file 3. Pool by specified pool_by values 4. 
Return dicts of pooled observation,expectation values """ #We'll want a quick unzip fn for converting points to trials #TODO: separate out into a 'get_paired_data_from_dirs' function pooled_observations = {} pooled_expectations = {} pairs = iter_prediction_expectation_pairs(obs_dir_fp, exp_dir_fp, file_name_field_order, file_name_delimiter, verbose=verbose) file_number = 0 for obs_table, exp_table, filename in pairs: #print "analyzing filename:",filename filename_metadata= get_metadata_from_filename(filename,file_name_field_order,\ file_name_delimiter,verbose=verbose) #base_tag = '%s\t%s\t' %(filename_metadata['holdout_method'],filename_metadata['prediction_method']) #tags = [base_tag+'all_results'] if 'file_type' in pool_by: pool_by.remove('file_type') #we do this manually at the end combined_tag = ['all'] * len(file_name_field_order.keys()) for field in file_name_field_order.keys(): #print combined_tag #print file_name_field_order idx = file_name_field_order[field] #print idx if field in pool_by: combined_tag[idx] = filename_metadata[field] tags = [file_name_delimiter.join(combined_tag)] if verbose: print "Pooling by:", pool_by print "Combined tags:", tags pooled_observations,pooled_expectations =\ update_pooled_data(obs_table,exp_table,tags,pooled_observations,\ pooled_expectations,str(file_number),verbose=verbose) file_number += 1 return pooled_observations, pooled_expectations
# NOTE(review): this is a fragment -- the enclosing function's header, its
# for-loop, and the except-clause binding `e` are outside the visible chunk,
# and main() below is truncated mid-body. Indentation here is
# reconstructed; confirm against the full file.
            if strict:
                raise IOError(e)
            else:
                if verbose:
                    print "Missing expectation file....skipping!"
                continue

        # Tag every table with an 'all_results' bucket plus a combined tag
        # naming the specific pooled field values for this file.
        base_tag = '%s\t%s\t' %(holdout_method,prediction_method)
        tags = [base_tag+'all_results']
        combined_tag = base_tag +\
          "\t".join([str(field)+"_"+str(filename_components[file_name_field_order[field]]) for field in pool_by])
        tags.append(combined_tag)

        #TODO: abstract out pooling into its own function
        # Fields not pooled over (None when absent from the filename).
        non_pooled_fields = [filename_components.get(file_name_field_order[k],None) for k in file_name_field_order.keys() if k not in pool_by]
        pooled_observations,pooled_expectations =\
          update_pooled_data(obs_table,exp_table,tags,pooled_observations,\
          pooled_expectations,str(file_number),verbose=verbose)

    return run_accuracy_calculations_on_pooled_data(pooled_observations,\
      pooled_expectations,roc_success_criteria=roc_success_criteria,verbose=verbose)


def main():
    # Script entry point (truncated in this chunk).
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    # pool_by arrives as a comma-separated string on the command line.
    pool_by = opts.pool_by.split(',')

    #create output directory
    make_output_dir(opts.output_dir)

    #Construct a dict from user specified field order
    file_name_field_order = {}