def _parse_joint_site_frequencies(self, filepath, field_name_prefix,
        is_normalize_by_site_counts, lineage_pair, locus_definition,
        results_d):
    with utility.universal_open(filepath) as src:
        lines = src.read().split("\n")
        col_keys = [c for c in lines[1].split("\t")[1:]
                    if "sites with multiple" not in c]
        row_idx = 0
        for line in lines[2:]:
            if not line:
                continue
            cols = line.split("\t")
            if len(cols) - 1 != len(col_keys):
                raise ValueError(
                    "Row {}: Expecting {} columns, but found {}: {}".format(
                        row_idx + 1, len(col_keys), len(cols) - 1, cols))
            assert len(cols) - 1 == len(col_keys)
            row_key = cols[0]
            col_idx = 0
            for col_key, val in zip(col_keys, cols[1:]):
                # results_d["{}.{}.{}".format(field_name_prefix, row_key, col_key)] = float(val)
                val = float(val)
                if is_normalize_by_site_counts:
                    val = val / locus_definition.num_sites
                results_d["{}.{}.{}".format(field_name_prefix, row_idx, col_idx)] = val
                col_idx += 1
            row_idx += 1
    return results_d
def _generate_parameter_file(
        self,
        fsc2_config_d,
        ):
    assert self.parameter_filepath
    with utility.universal_open(
            os.path.join(self.working_directory, self.parameter_filepath),
            "w") as dest:
        self._write_parameter_configuration(
            dest=dest,
            fsc2_config_d=fsc2_config_d,
            )
def _parse_raw_results_dna(self):
    data_dict = collections.OrderedDict()
    decodings = list(itertools.permutations("ACGT", 4))
    column_decodings = {}
    with utility.universal_open(self.arlequin_filepath) as src:
        idx = 0
        first_pop_max_ind_id = 0
        in_alignment = False
        for row in src:
            row = row[:-1]  # chomp terminating \n
            if in_alignment:
                if row == "":
                    if idx == 2:
                        break
                    else:
                        in_alignment = False
                else:
                    try:
                        x1, x2, encoded_data = row.split("\t")
                        decoded_data = []
                        for col_idx, ch in enumerate(encoded_data):
                            try:
                                decoding_lookup = column_decodings[col_idx]
                            except KeyError:
                                decoding_lookup = self.rng.choice(decodings)
                                column_decodings[col_idx] = decoding_lookup
                            decoded_data.append(decoding_lookup[int(ch)])
                        pop_id, ind_id = [
                            int(label_part) for label_part in x1.split("_")
                        ]
                        # This ugly hack is because fastsimcoal inconsistently numbers
                        # individuals on trees vs. sequences ... even assuming there
                        # is a correspondence.
                        if pop_id == 1:
                            first_pop_max_ind_id = max(first_pop_max_ind_id, ind_id)
                        else:
                            ind_id += first_pop_max_ind_id
                        taxon_label = self._compose_raw_data_taxon_label(
                            population_id=pop_id,
                            individual_id=ind_id)
                        # taxon_label = self._compose_raw_data_taxon_label(
                        #         population_id=idx,
                        #         individual_id=x1)
                        data_dict[taxon_label] = "".join(decoded_data)
                    except IndexError:
                        raise
            elif "SampleData=" in row:
                idx += 1
                in_alignment = True
    return dendropy.DnaCharacterMatrix.from_dict(data_dict)
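# Illustrative sketch (standalone; the encoded sequence and seed below are
# hypothetical, not real fastsimcoal output). In _parse_raw_results_dna()
# above, each alignment column arrives as a digit 0-3 and is assigned a
# randomly drawn permutation of "ACGT" the first time that column is seen,
# so the same digit always decodes to the same base within a column:
import itertools
import random

rng = random.Random(12345)
decodings = list(itertools.permutations("ACGT", 4))
column_decodings = {}
encoded_sequence = "0213"   # hypothetical integer-coded sequence, one digit per site
decoded = []
for col_idx, ch in enumerate(encoded_sequence):
    if col_idx not in column_decodings:
        # Each column gets its own digit->base mapping, reused for every
        # subsequently decoded sequence.
        column_decodings[col_idx] = rng.choice(decodings)
    decoded.append(column_decodings[col_idx][int(ch)])
print("".join(decoded))   # a 4-base string, e.g. "GTAC", depending on the draws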
def process(self, target_data_filepath, priors_data_filepaths,
        output_name_prefix, output_directory, output_suffix):
    if output_name_prefix is None:
        output_name_prefix = os.path.splitext(
            os.path.basename(target_data_filepath))[0]
    if output_suffix is None:
        output_suffix = ""
    else:
        output_suffix = "." + output_suffix
    with utility.universal_open(target_data_filepath) as src:
        target_data_reader = csv.DictReader(
            src,
            delimiter=self.field_delimiter,
            quoting=csv.QUOTE_NONE)
        for target_row_idx, target_row in enumerate(target_data_reader):
            if target_row_idx == 0:
                self.stat_fieldnames, self.non_stat_fieldnames = self.extract_stat_fieldnames(
                    target_data_reader.fieldnames)
                self.stat_fieldnames_set = set(self.stat_fieldnames)
            self.run_logger.info(
                "Scoring target data {}".format(target_row_idx + 1))
            target_data_vector = self.extract_stats_data_vector_from_csv_row(
                target_row)
            posteriors_filepath = os.path.join(
                output_directory,
                "{}.posterior.{:03d}.samples{}.tsv".format(
                    output_name_prefix, target_row_idx + 1, output_suffix))
            self.accept_reject(
                target_data_vector=target_data_vector,
                priors_data_filepaths=priors_data_filepaths,
                output_filepath=posteriors_filepath)
            if self.is_output_target_params:
                target_params_filepath = os.path.join(
                    output_directory,
                    "{}.posterior.{:03d}.target{}.tsv".format(
                        output_name_prefix, target_row_idx + 1, output_suffix))
                with open(target_params_filepath, "w") as target_params_f:
                    target_params_f.write(
                        self.field_delimiter.join(self.non_stat_fieldnames))
                    target_params_f.write("\n")
                    target_params_f.write(
                        self.field_delimiter.join(
                            str(target_row[k]) for k in self.non_stat_fieldnames))
                    target_params_f.write("\n")
def _parse_deme_site_frequencies(self, filepath, field_name_prefix,
        is_normalize_by_site_counts, lineage_pair, locus_definition,
        results_d):
    with utility.universal_open(filepath) as src:
        lines = src.read().split("\n")
        assert len(lines) == 4 and lines[3] == ""
        header_row = lines[1].split("\t")
        results_d_row = lines[2].split("\t")
        assert len(header_row) == len(results_d_row)
        for key, val in zip(header_row, results_d_row):
            if not val:
                continue
            key = "{}.{}".format(field_name_prefix, key)
            val = float(val)
            if is_normalize_by_site_counts:
                val = val / locus_definition.num_sites
            results_d[key] = val
    return results_d
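# Illustrative sketch (standalone; the header labels and counts below are made
# up, not real fastsimcoal2 output). _parse_deme_site_frequencies() above
# expects a file of exactly four lines -- a title line, a tab-delimited header,
# a tab-delimited row of values, and a trailing blank line -- and maps each
# non-empty value to "<field_name_prefix>.<header key>", optionally dividing
# by the number of sites in the locus:
lines = ["1 observations", "d0_0\td0_1\td0_2", "12\t3\t1", ""]
field_name_prefix = "locus1.sfs"   # hypothetical prefix
num_sites = 1000                   # stands in for locus_definition.num_sites
results_d = {}
for key, val in zip(lines[1].split("\t"), lines[2].split("\t")):
    if not val:
        continue
    results_d["{}.{}".format(field_name_prefix, key)] = float(val) / num_sites
assert results_d == {
    "locus1.sfs.d0_0": 0.012,
    "locus1.sfs.d0_1": 0.003,
    "locus1.sfs.d0_2": 0.001,
}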
def write_results(self):
    for file_idx, file_info in enumerate(self.file_infos):
        output_filepath = self.compose_output_path_f(file_info.filepath, file_idx)
        self.run_logger.info("Writing file {} of {}: '{}'".format(
            file_idx + 1, len(self.file_infos), output_filepath))
        with utility.universal_open(output_filepath, "w") as dest:
            writer = utility.get_csv_writer(
                dest=dest,
                fieldnames=file_info.fieldnames,
                delimiter=self.field_delimiter,
                restval=self.missing_data_value,
                )
            writer.writeheader()
            for data_row_idx in range(*file_info.data_row_idx_range):
                if self.logging_frequency and data_row_idx > 0 and (
                        data_row_idx % self.logging_frequency) == 0:
                    self.run_logger.info("- Writing row {}".format(data_row_idx + 1))
                row = {}
                for field_name in file_info.fieldnames:
                    row[field_name] = self.fields[field_name][data_row_idx]
                writer.writerow(row)
def accept_reject(self, target_data_vector, priors_data_filepaths, output_filepath):
    if self.rejection_criteria_type == "num":
        num_to_retain = self.rejection_criteria_value
    else:
        num_to_retain = None
    dest = utility.universal_open(output_filepath, "w")
    all_prior_fieldnames = []
    all_prior_fieldnames_set = None
    accepted_heap = []
    for fidx, priors_data_filepath in enumerate(priors_data_filepaths):
        self.run_logger.info("Reading simulation file {} of {}: '{}'".format(
            fidx + 1, len(priors_data_filepaths), priors_data_filepath))
        with utility.universal_open(priors_data_filepath) as src:
            priors_data_reader = csv.DictReader(
                src,
                delimiter=self.field_delimiter,
                quoting=csv.QUOTE_NONE)
            for row_idx, row in enumerate(priors_data_reader):
                if self.logging_frequency and row_idx > 0 and row_idx % self.logging_frequency == 0:
                    self.run_logger.info(
                        "Reading simulation file {} of {}, row {}".format(
                            fidx + 1, len(priors_data_filepaths), row_idx + 1))
                if row_idx == 0:
                    if fidx == 0:
                        all_prior_fieldnames = list(priors_data_reader.fieldnames)
                        all_prior_fieldnames_set = set(all_prior_fieldnames)
                        current_file_stat_fieldnames = set(
                            self.extract_stat_fieldnames(priors_data_reader.fieldnames)[0])
                        s1 = current_file_stat_fieldnames - self.stat_fieldnames_set
                        if s1:
                            raise ValueError(
                                "File '{}': Following summary statistics fields not found in target: {}".format(
                                    priors_data_filepath, ", ".join(s1)))
                        s2 = self.stat_fieldnames_set - current_file_stat_fieldnames
                        if s2:
                            raise ValueError(
                                "File '{}': Following summary statistics fields given in target but not found here: {}".format(
                                    priors_data_filepath, ", ".join(s2)))
                        header_row = []
                        for fnidx, fn in enumerate(all_prior_fieldnames):
                            if self.is_write_summary_stats or fn not in self.stat_fieldnames_set:
                                header_row.append(fn)
                        if self.is_write_rejection_score:
                            header_row.append(self.distance_score_fieldname)
                            header_row.append(self.normalized_distance_score_fieldname)
                        dest.write("{}\n".format(self.field_delimiter.join(header_row)))
                    else:
                        current_file_fieldnames = set(priors_data_reader.fieldnames)
                        s1 = current_file_fieldnames - all_prior_fieldnames_set
                        if s1:
                            raise ValueError(
                                "File '{}': Following fields found, but not found in previous files: {}".format(
                                    priors_data_filepath, ", ".join(s1)))
                        s2 = all_prior_fieldnames_set - current_file_fieldnames
                        if s2:
                            raise ValueError(
                                "File '{}': Following fields found in previous files, but not found here: {}".format(
                                    priors_data_filepath, ", ".join(s2)))
                try:
                    prior_data_vector = self.extract_stats_data_vector_from_csv_row(row)
                except SisterBayesRejectorStatsVectorValueException:
                    if self.is_ignore_invalid_priors_data_vectors:
                        continue
                    else:
                        raise
                except SisterBayesRejectorStatsVectorSizeException:
                    if self.is_ignore_invalid_priors_data_vectors:
                        continue
                    else:
                        raise
                distance_score = self.euclidean_distance(target_data_vector, prior_data_vector)
                row_values = self.field_delimiter.join(
                    row[fn] for fn in priors_data_reader.fieldnames
                    if self.is_write_summary_stats or fn not in self.stat_fieldnames_set)
                if self.is_write_rejection_score:
                    row_values = "{}{}{}".format(
                        row_values, self.field_delimiter, distance_score)
                    # Normalize by the number of comparisons. How do we get this?
                    # Consider the following vectors:
                    #   > x1 = c(3.1, 3.1, 3.1)
                    #   > x2 = c(5.1, 5.1, 5.1)
                    #   > y1 = c(3.1, 3.1, 3.1, 3.1, 3.1)
                    #   > y2 = c(5.1, 5.1, 5.1, 5.1, 5.1)
                    # The naive/raw Euclidean distances differ, due to the different
                    # number of comparisons:
                    #   > sqrt(sum((x2-x1)**2))
                    #   [1] 3.464102
                    #   > sqrt(sum((y2-y1)**2))
                    #   [1] 4.472136
                    # But dividing by the square root of the length of the vectors
                    # makes them equal:
                    #   > sqrt(sum((x2-x1)**2)) / sqrt(3)
                    #   [1] 2
                    #   > sqrt(sum((y2-y1)**2)) / sqrt(5)
                    #   [1] 2
                    normalized_distance_score = distance_score / math.sqrt(len(target_data_vector))
                    row_values = "{}{}{}".format(
                        row_values, self.field_delimiter, normalized_distance_score)
                heap_score = -1 * distance_score
                heap_entry = (heap_score, row_values)
                if self.rejection_criteria_type == "distance":
                    if distance_score <= self.rejection_criteria_value:
                        accepted_heap.append(heap_entry)
                elif self.rejection_criteria_type == "num":
                    if len(accepted_heap) < num_to_retain:
                        accepted_heap.append(heap_entry)
                        if len(accepted_heap) == num_to_retain:
                            heapq.heapify(accepted_heap)
                    else:
                        heapq.heappushpop(accepted_heap, heap_entry)
                else:
                    raise NotImplementedError(self.rejection_criteria_type)
                # for fnidx, fn in enumerate(all_prior_fieldnames):
                #     value = row[fn]
                #     if self.is_write_summary_stats or fn not in self.stat_fieldnames_set:
                #         dest.write("{}{}".format(value, self.field_delimiter))
                # dest.write("{}\n".format(distance))
    accepted_heap.sort(reverse=True)
    for hidx, heap_entry in enumerate(accepted_heap):
        heap_entry = accepted_heap[hidx]
        dest.write(heap_entry[1])
        dest.write("\n")
    dest.flush()
    dest.close()
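# Illustrative sketch (standalone; not part of the rejector class). Two points
# from accept_reject() above, restated with hypothetical values:
#   1. The normalized score divides the raw Euclidean distance by sqrt(n),
#      where n is the number of summary statistics, so scores computed over
#      different numbers of statistics are on a comparable per-dimension scale
#      (this reproduces the R example in the comments above).
#   2. Retaining the closest `num_to_retain` rows uses a min-heap of
#      (-distance, row) entries, so the currently worst (largest-distance)
#      entry sits at the top and is evicted first by heapq.heappushpop().
import heapq
import math

def normalized_euclidean(v1, v2):
    # Raw Euclidean distance divided by the square root of the vector length.
    return math.sqrt(sum((b - a) ** 2 for a, b in zip(v1, v2))) / math.sqrt(len(v1))

x1, x2 = [3.1] * 3, [5.1] * 3
y1, y2 = [3.1] * 5, [5.1] * 5
assert abs(normalized_euclidean(x1, x2) - 2.0) < 1e-9
assert abs(normalized_euclidean(y1, y2) - 2.0) < 1e-9

# Keep the 2 closest of several hypothetical (distance, label) samples:
heap = []
for distance, label in [(0.9, "s1"), (0.2, "s2"), (0.5, "s3"), (0.1, "s4")]:
    entry = (-distance, label)
    if len(heap) < 2:
        heap.append(entry)
        if len(heap) == 2:
            heapq.heapify(heap)
    else:
        heapq.heappushpop(heap, entry)
assert sorted(label for _, label in heap) == ["s2", "s4"]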
def main():
    parser = argparse.ArgumentParser()
    package_id = sisterbayes.package_id()
    parser.add_argument("--version", action="version", version=package_id)
    simulator_options = parser.add_argument_group("Simulation Configuration")
    simulator_options.add_argument(
        "configuration_filepath",
        metavar="CONFIGURATION-FILE",
        help="Path to file defining the simulation model and parameters.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument(
        "-o", "--output-name-prefix",
        action="store",
        dest="output_name_prefix",
        type=str,
        default=None,
        metavar="NAME-PREFIX",
        help="Prefix for output filenames (default: same as configuration filename stem).")
    output_options.add_argument(
        "-O", "--output-directory",
        action="store",
        dest="output_directory",
        type=str,
        default=None,
        metavar="DIRECTORY",
        help="Directory for output files (default: current working directory).")
    output_options.add_argument(
        "-U", "--unfolded-site-frequency-spectrum",
        "--derived-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the unfolded or derived site frequency spectrum."
             " Otherwise, defaults to the folded or minor site frequency"
             " spectrum.")
    output_options.add_argument(
        "--infinite-sites-model",
        action="store_true",
        default=False,
        help="Use infinite sites model instead of finite sites.")
    output_options.add_argument(
        "--calculate-single-population-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the single (within) population site frequency"
             " spectrum in addition to the joint.")
    output_options.add_argument(
        "--concatenate-loci",
        action="store_true",
        default=False,
        help="Concatenate statistics for all loci into one effective locus.")
    output_options.add_argument(
        "--no-normalize-by-concatenated-loci-count",
        dest="is_normalize_by_concatenated_num_loci",
        action="store_false",
        default=True,
        help="If concatenating loci, do NOT normalize frequency spectrum values by number of loci.")
    output_options.add_argument(
        "--no-normalize-by-site-counts",
        dest="normalize_by_site_counts",
        action="store_false",
        default=True,
        help="Do *not* normalize frequency spectrum values by number of sites in each locus.")
    output_options.add_argument(
        "-l", "--labels",
        action="append",
        help="Additional field/value pairs to add to the output (in format <FIELD-NAME>:value;).")
    output_options.add_argument(
        "--field-delimiter",
        type=str,
        default="\t",
        help="Delimiter string separating fields in output (default: <TAB>).")
    output_options.add_argument(
        "--summary-stats-label-prefix",
        type=str,
        default="stat",
        metavar="PREFIX",
        help="Prefix for summary statistic field labels (default: '%(default)s').")
    output_options.add_argument(
        "--include-model-id-field",
        action="store_true",
        default=False,
        help="Include a 'model.id' field (with same value as 'param.divTimeModel' field) in output.")
    output_options.add_argument(
        "--append",
        action="store_true",
        default=False,
        help="Append instead of overwriting output file(s).")
    output_options.add_argument(
        "--no-write-header",
        action="store_true",
        default=False,
        help="Do not write header row.")
    output_options.add_argument(
        "--raw-data",
        action="store_true",
        default=False,
        help="Output raw data (alignments and trees).")
    output_options.add_argument(
        "--raw-data-alignment",
        action="store_true",
        default=False,
        help="Output raw alignment.")
    output_options.add_argument(
        "--raw-data-mutation-tree",
        action="store_true",
        default=False,
        help="Output raw mutation tree.")
    output_options.add_argument(
        "--raw-data-true-tree",
        action="store_true",
        default=False,
        help="Output raw true tree.")
    output_options.add_argument(
        "--raw-data-alignment-format",
        default="fasta",
        choices=["fasta", "phylip", "nexus"],
        help="Format for the raw data alignments ('fasta', 'phylip', or 'nexus'; default='fasta').")
    output_options.add_argument(
        "--raw-data-tree-format",
        default="nexus",
        choices=["nexus", "newick", "nexml"],
        help="Format for the raw data trees ('nexus', 'newick', or 'nexml'; default='nexus').")
    output_options.add_argument(
        "--params-only-file",
        action="store_true",
        default=False,
        help="Output file consisting of parameters only (for checking/validation).")
    run_options = parser.add_argument_group("Run Options")
    run_options.add_argument(
        "-n", "--num-reps",
        type=int,
        default=1,
        help="Number of replicates (default: %(default)s).")
    run_options.add_argument(
        "-m", "--num-processes",
        default=1,
        type=int,
        help="Number of processes/CPU to run (default: %(default)s).")
    run_options.add_argument(
        "-z", "--random-seed",
        default=None,
        help="Seed for random number generator engine.")
    run_options.add_argument(
        "-q", "--quiet",
        action="store_true",
        help="Work silently.")
    run_options.add_argument(
        "--log-to-file",
        action="store_true",
        dest="log_to_file",
        default=None,
        help="Save log to file.")
    run_options.add_argument(
        "--log-frequency",
        default=None,
        type=int,
        help="Frequency that background progress messages get written to the log (0: do not log informational messages).")
    run_options.add_argument(
        "--file-logging-level",
        default="none",
        choices=["debug", "info", "warning", "error", "critical", "none"],
        help="Message level threshold for file logs (default: %(default)s).")
    run_options.add_argument(
        "--stderr-logging-level",
        default="info",
        choices=["debug", "info", "warning", "error", "critical", "none"],
        help="Message level threshold for screen logs (default: %(default)s).")
    run_options.add_argument(
        "-w", "--working-directory-parent",
        action="store",
        type=str,
        default=None,
        help="Directory within which to create temporary directories and files.")
    run_options.add_argument(
        "--no-cleanup",
        action="store_true",
        default=False,
        help="Do not clean up temporary files.")
    run_options.add_argument(
        "--debug-mode",
        action="store_true",
        default=False,
        help="Run in debugging mode.")
    fsc2_options = parser.add_argument_group("FastSimCoal2 Options")
    fsc2_options.add_argument(
        "--fsc2-path",
        metavar="FSC2-PATH",
        default=os.environ.get("SISTERBAYES_FSC2_PATH", "fsc"),
        help="Path to FastSimCoal2 application (default: %(default)s).")
    args = parser.parse_args()
    config_d = {}
    if not os.path.exists(args.configuration_filepath):
        sys.exit("ERROR: Configuration file '{}' not found.".format(
            args.configuration_filepath))
    utility.parse_legacy_configuration(
        filepath=args.configuration_filepath,
        config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
        primary_source_filepath=args.configuration_filepath,
        output_name_prefix=args.output_name_prefix,
        output_directory=args.output_directory)
    if args.log_frequency is None:
        config_d["logging_frequency"] = int(args.num_reps / 10.0)
    elif args.log_frequency == 0:
        config_d["logging_frequency"] = None
    else:
        config_d["logging_frequency"] = args.log_frequency
    config_d["fsc2_path"] = args.fsc2_path
    if utility.which(config_d["fsc2_path"]) is None:
        sys.exit("ERROR: FastSimCoal2 executable '{}' not found.\n"
                 "Install FastSimCoal2 and specify path to the executable\n"
                 "using the '--fsc2-path' argument.".format(config_d["fsc2_path"]))
    config_d["file_logging_level"] = args.file_logging_level
    config_d["standard_error_logging_level"] = args.stderr_logging_level
    config_d["log_to_file"] = args.log_to_file
    config_d["log_to_stderr"] = not args.quiet
    config_d["is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d["is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["is_infinite_sites_model"] = args.infinite_sites_model
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(args.labels)
    config_d["field_delimiter"] = args.field_delimiter
    config_d["is_include_model_id_field"] = args.include_model_id_field
    config_d["is_concatenate_loci"] = args.concatenate_loci
    config_d["is_normalize_by_concatenated_num_loci"] = args.is_normalize_by_concatenated_num_loci
    config_d["is_normalize_by_site_counts"] = args.normalize_by_site_counts
    is_store_raw_alignment = False
    is_store_raw_mutation_tree = False
    is_store_raw_true_tree = False
    if args.raw_data:
        is_store_raw_alignment = True
        is_store_raw_mutation_tree = True
        is_store_raw_true_tree = True
    if args.raw_data_alignment:
        is_store_raw_alignment = True
    if args.raw_data_mutation_tree:
        is_store_raw_mutation_tree = True
    if args.raw_data_true_tree:
        is_store_raw_true_tree = True
    print(is_store_raw_alignment)
    print(is_store_raw_mutation_tree)
    print(is_store_raw_true_tree)
    with utility.TemporaryDirectory(
            prefix="sisterbayes-",
            parent_dir=args.working_directory_parent,
            is_suppress_cleanup=args.no_cleanup) as working_directory:
        config_d["working_directory"] = working_directory
        simulator = simulate.SisterBayesSimulator(
            config_d=config_d,
            num_processes=args.num_processes,
            is_verbose_setup=True,
            package_id=package_id,
            is_store_raw_alignment=is_store_raw_alignment,
            is_store_raw_mutation_tree=is_store_raw_mutation_tree,
            is_store_raw_true_tree=is_store_raw_true_tree,
            raw_data_output_prefix=config_d["output_prefix"],
            raw_data_alignment_format=args.raw_data_alignment_format,
            raw_data_tree_format=args.raw_data_tree_format,
            is_debug_mode=args.debug_mode,
            )
        main_dest_filepath = config_d["output_prefix"] + ".stats.tsv"
        dest = utility.universal_open(main_dest_filepath, "a" if args.append else "w")
        if args.params_only_file:
            params_only_dest_filepath = config_d["output_prefix"] + ".params.tsv"
            params_only_dest = utility.universal_open(
                params_only_dest_filepath, "a" if args.append else "w")
        else:
            params_only_dest = None
        # dest = utility.open_destput_file_for_csv_writer(
        #         filepath=filepath,
        #         is_append=args.append)
        if args.append or args.no_write_header:
            is_write_header = False
        else:
            is_write_header = True
        with dest:
            # writer = utility.get_csv_writer(
            #         dest=dest,
            #         delimiter=args.field_delimiter)
            try:
                results = simulator.execute(
                    nreps=args.num_reps,
                    dest=dest,
                    results_store=None,
                    params_only_dest=params_only_dest,
                    is_write_header=is_write_header,
                    )
            except Exception as e:
                sys.stderr.write(
                    "Traceback (most recent call last):\n {}{}\n".format(
                        " ".join(traceback.format_tb(sys.exc_info()[2])), e))
                sys.exit(1)
        if params_only_dest:
            params_only_dest.close()
def main():
    parser = argparse.ArgumentParser()
    package_id = sisterbayes.package_id()
    parser.add_argument("--version", action="version", version=package_id)
    simulator_options = parser.add_argument_group("Configuration")
    simulator_options.add_argument(
        "configuration_filepath",
        metavar="CONFIGURATION-FILE",
        help="Path to the configuration file listing the data.")
    processing_options = parser.add_argument_group("Processing Options")
    processing_options.add_argument(
        "-U", "--unfolded-site-frequency-spectrum",
        "--derived-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the unfolded or derived site frequency spectrum."
             " Otherwise, defaults to the folded or minor site frequency"
             " spectrum.")
    processing_options.add_argument(
        "--calculate-single-population-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the single (within) population site frequency"
             " spectrum in addition to the joint.")
    processing_options.add_argument(
        "--no-normalize-by-site-counts",
        dest="normalize_by_site_counts",
        action="store_false",
        default=True,
        help="Do *not* normalize frequency spectrum by number of sites in each locus.")
    processing_options.add_argument(
        "--concatenate-loci",
        action="store_true",
        default=False,
        help="Collapse all loci and treat as a single locus for calculation.")
    processing_options.add_argument(
        "--concatenated-locus-label",
        default=None,
        help="If concatenating, label for the concatenated locus.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument(
        "-o", "--output-name-prefix",
        action="store",
        dest="output_name_prefix",
        type=str,
        default=None,
        metavar="NAME-PREFIX",
        help="Prefix for output filenames (default: same as configuration filename stem).")
    output_options.add_argument(
        "-O", "--output-directory",
        action="store",
        dest="output_directory",
        type=str,
        default=None,
        metavar="DIRECTORY",
        help="Directory for output files (default: current working directory).")
    output_options.add_argument(
        "-l", "--labels",
        action="append",
        help="Additional field/value pairs to add to the output (in format <FIELD-NAME>:value;).")
    output_options.add_argument(
        "--field-delimiter",
        type=str,
        default="\t",
        help="Delimiter string separating fields in output (default: <TAB>).")
    output_options.add_argument(
        "--summary-stats-label-prefix",
        type=str,
        default="stat",
        metavar="PREFIX",
        help="Prefix for summary statistic field labels (default: '%(default)s').")
    output_options.add_argument(
        "--append",
        action="store_true",
        default=False,
        help="Append instead of overwriting output file(s).")
    output_options.add_argument(
        "--no-write-header",
        action="store_true",
        default=False,
        help="Do not write header row.")
    args = parser.parse_args()
    config_d = utility.CaseInsensitiveDict()
    utility.parse_legacy_configuration(
        filepath=args.configuration_filepath,
        config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
        primary_source_filepath=args.configuration_filepath,
        output_name_prefix=args.output_name_prefix,
        output_directory=args.output_directory)
    config_d["is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d["is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(args.labels)
    config_d["alignment_directory_head"] = os.path.dirname(
        os.path.abspath(args.configuration_filepath))
    config_d["field_delimiter"] = args.field_delimiter
    config_d["is_concatenate_loci"] = args.concatenate_loci
    config_d["concatenated_locus_label"] = args.concatenated_locus_label
    config_d["is_normalize"] = args.normalize_by_site_counts
    sscalc = sumstats.SisterBayesSummaryStatsCalculator(**config_d)
    filepath = config_d["output_prefix"] + ".obs.sumstats.tsv"
    # dest = utility.open_destput_file_for_csv_writer(
    #         filepath=filepath,
    #         is_append=args.append)
    dest = utility.universal_open(filepath, "a" if args.append else "w")
    if args.append or args.no_write_header:
        is_write_header = False
    else:
        is_write_header = True
    with dest:
        # writer = utility.get_csv_writer(
        #         dest=dest,
        #         delimiter=args.field_delimiter)
        try:
            results = sscalc.write_summary_stats(
                dest=dest,
                results_store=None,
                is_write_header=is_write_header)
        except Exception as e:
            sys.stderr.write(
                "Traceback (most recent call last):\n {}{}\n".format(
                    " ".join(traceback.format_tb(sys.exc_info()[2])), e))
            sys.exit(1)
def summarize(
        self,
        target_data_filepath,
        ):
    if self.output_name_prefix is None:
        self.output_name_prefix = os.path.splitext(
            os.path.basename(target_data_filepath))[0]
    if self.output_directory is None:
        self.output_directory = "."
    self.output_directory = os.path.realpath(self.output_directory)
    output_prefix = os.path.join(self.output_directory, self.output_name_prefix)
    with utility.universal_open(target_data_filepath) as src:
        reader = csv.DictReader(
            src,
            delimiter=self.field_delimiter,
            quoting=csv.QUOTE_NONE)
        categorical_params = collections.OrderedDict()
        continuous_params = collections.OrderedDict()
        realized_div_time_samples = []
        all_div_times = []
        sp_labels = []
        for row_idx, row in enumerate(reader):
            realized_div_time_samples.append({})
            for key_idx, key in enumerate(reader.fieldnames):
                if key in categorical_params:
                    categorical_params[key][row[key]] += 1
                elif key in continuous_params:
                    continuous_params[key].append(float(row[key]))
                else:
                    if key in ("param.divTimeModel", "param.numDivTimes"):
                        val = row[key]
                        is_categorical = True
                    else:
                        try:
                            val = float(row[key])
                            is_categorical = False
                        except ValueError:
                            val = row[key]
                            is_categorical = True
                    if is_categorical:
                        categorical_params[key] = collections.Counter()
                        categorical_params[key][val] += 1
                    else:
                        continuous_params[key] = [val]
                if key.startswith("param.divTime."):
                    sp_label = key.replace("param.divTime.", "")
                    realized_div_time_samples[-1][sp_label] = continuous_params[key][-1]
                    all_div_times.append(continuous_params[key][-1])
                    if row_idx == 0:
                        sp_labels.append(sp_label)
        ### EXPERIMENTAL ###
        # categorical_params["param.effectiveDivTimeModel"] = self.cluster_by_relative_difference_threshold(
        #         sp_labels=sp_labels,
        #         realized_div_time_samples=realized_div_time_samples,
        #         all_div_times=all_div_times,
        #         relative_difference_threshold=0.01)
        if self.cluster_criteria is not None:
            if self.cluster_criteria == "bin_size":
                cluster_results = self.cluster_by_bin_size(
                    sp_labels=sp_labels,
                    realized_div_time_samples=realized_div_time_samples,
                    all_div_times=all_div_times,
                    bin_size=self.cluster_criteria_value)
            elif self.cluster_criteria == "num_bins":
                cluster_results = self.cluster_by_num_bins(
                    sp_labels=sp_labels,
                    realized_div_time_samples=realized_div_time_samples,
                    all_div_times=all_div_times,
                    num_bins=self.cluster_criteria_value)
            elif self.cluster_criteria == "absolute_difference_threshold":
                cluster_results = self.cluster_by_absolute_difference_threshold(
                    sp_labels=sp_labels,
                    realized_div_time_samples=realized_div_time_samples,
                    absolute_difference_threshold=self.cluster_criteria_value)
            elif self.cluster_criteria == "relative_difference_threshold":
                cluster_results = self.cluster_by_relative_difference_threshold(
                    sp_labels=sp_labels,
                    realized_div_time_samples=realized_div_time_samples,
                    all_div_times=all_div_times,
                    relative_difference_threshold=self.cluster_criteria_value)
            else:
                raise ValueError(
                    "Unrecognized cluster criteria: '{}'".format(self.cluster_criteria))
            categorical_params["param.effectiveDivTimeModel"] = cluster_results
        ### EXPERIMENTAL ###
        with utility.universal_open(output_prefix + ".summary.continuous.tsv", "w") as dest:
            row_results = collections.OrderedDict()
            for param_idx, param_name in enumerate(continuous_params):
                values = continuous_params[param_name]
                row_results["param"] = param_name
                summary = calclib.summarize(values)
                row_results["mean"] = summary["mean"]
                row_results["var"] = summary["var"]
                row_results["sd"] = summary["sd"]
                row_results["min"] = summary["range"][0]
                row_results["max"] = summary["range"][1]
                try:
                    row_results["hpd5"] = summary["hpd95"][0]
                    row_results["hpd95"] = summary["hpd95"][1]
                except TypeError:
                    row_results["hpd5"] = "NA"
                    row_results["hpd95"] = "NA"
                try:
                    row_results["quant5"] = summary["quant_5_95"][0]
                    row_results["quant95"] = summary["quant_5_95"][1]
                except TypeError:
                    row_results["quant5"] = "NA"
                    row_results["quant95"] = "NA"
                if param_idx == 0:
                    dest.write(self.field_delimiter.join(row_results.keys()) + "\n")
                dest.write(
                    self.field_delimiter.join(
                        "{}".format(v) for v in row_results.values()) + "\n")
        for param_idx, param_name in enumerate(categorical_params):
            with utility.universal_open(
                    output_prefix + ".summary.{:02d}.{}.tsv".format(
                        param_idx + 1, param_name), "w") as dest:
                param_counter = categorical_params[param_name]
                total = float(sum(param_counter.values()))
                for category_idx, (category_name, category_count) in enumerate(
                        param_counter.most_common()):
                    row_results = collections.OrderedDict()
                    row_results["label"] = category_name
                    row_results["freq"] = category_count / total
                    row_results["count"] = category_count
                    if category_idx == 0:
                        dest.write(self.field_delimiter.join(row_results.keys()) + "\n")
                    dest.write(
                        self.field_delimiter.join(
                            "{}".format(v) for v in row_results.values()) + "\n")
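# Illustrative sketch (standalone; the model labels are made up). The
# categorical summaries written above reduce each categorical parameter to a
# label / freq / count table via collections.Counter, ordered by frequency:
import collections

param_counter = collections.Counter(["M0", "M0", "M1", "M0", "M2"])
total = float(sum(param_counter.values()))
for label, count in param_counter.most_common():
    print("label={} freq={} count={}".format(label, count / total, count))
# label=M0 freq=0.6 count=3
# label=M1 freq=0.2 count=1
# label=M2 freq=0.2 count=1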
def read_files(self, filepaths):
    for file_idx, filepath in enumerate(filepaths):
        self.run_logger.info("Reading file {} of {}: '{}'".format(
            file_idx + 1, len(filepaths), filepath))
        with utility.universal_open(filepath) as src:
            self._read_file(src)