def write_posterior(self, output_prefix, target_data_filepath):
     if output_prefix is None:
         output_prefix = os.path.splitext(
             os.path.basename(target_data_filepath))[0] + ".posterior"
     with utility.universal_open(target_data_filepath) as src:
         reader = csv.DictReader(src,
                                 delimiter=self.field_delimiter,
                                 quoting=csv.QUOTE_NONE)
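         # Each row of the target data file is treated as an independent target
         # for rejection sampling.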
         for row_idx, row in enumerate(reader):
             assert len(row) == len(reader.fieldnames)
             target_stat_values = []
             target_other_values = []
             for key_idx, key in enumerate(
                     self.all_fieldnames
             ):  # keys must be read in same order!
                 if key not in row:
                     continue
                 if not self.is_suppress_checks:
                     if key not in self.stat_fieldnames_check and key not in self.other_fieldname_check:
                         raise ValueError(
                             "File '{}', target {}, column {}: field '{}' not recognized"
                              .format(target_data_filepath, row_idx + 1,
                                     key_idx + 1, key))
                 if key.startswith(self.stats_field_prefix):
                     target_stat_values.append(float(row[key]))
                 else:
                     target_other_values.append(row[key])
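             # Select the posterior sample: either all simulated samples within a fixed
             # distance of the target, or the N closest samples (with N given directly
             # or as a proportion).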
             if self.rejection_criteria_type == "distance":
                 posterior_indexes = self.filter_by_distance(
                     target_stat_values=target_stat_values,
                     max_distance=self.rejection_criteria_value)
             else:
                 if self.rejection_criteria_type == "num":
                     num_to_retain = self.rejection_criteria_value
                 elif self.rejection_criteria_type == "proportion":
                     num_to_retain = int(self.rejection_criteria_value *
                                         len(self.stat_values))
                 posterior_indexes = self.closest_values_indexes(
                     target_stat_values=target_stat_values,
                     num_to_retain=num_to_retain,
                 )
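             # Write the retained (posterior) samples for this target to their own output file.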
             with utility.universal_open(
                     output_prefix + ".{}.tsv".format(row_idx + 1), "w") as dest:
                 dest.write(
                     self.field_delimiter.join(
                         str(v) for v in self.other_fieldnames))
                 if self.is_output_summary_stats:
                     # delimiter between the "other" fields and the summary-stat fields
                     dest.write(self.field_delimiter)
                     dest.write(
                         self.field_delimiter.join(
                             str(v) for v in self.stat_fieldnames))
                 dest.write("\n")
                 for distance, index in posterior_indexes:
                     dest.write(
                         self.field_delimiter.join(
                             str(v) for v in self.other_values[index]))
                     if self.is_output_summary_stats:
                         dest.write(self.field_delimiter)
                         dest.write(
                             self.field_delimiter.join(
                                 str(v) for v in self.stat_values[index]))
                     dest.write("\n")
Example #2
 def _generate_parameter_file(
     self,
     fsc2_config_d,
 ):
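     # Write the FastSimCoal2 parameter/configuration file into the working directory.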
     assert self.parameter_filepath
     with utility.universal_open(
             os.path.join(self.working_directory, self.parameter_filepath),
             "w") as dest:
         self._write_parameter_configuration(
             dest=dest,
             fsc2_config_d=fsc2_config_d,
         )
Example #3
 def _parse_deme_site_frequencies(self, filepath, field_name_prefix,
                                  results_d):
     with utility.universal_open(filepath) as src:
         lines = src.read().split("\n")
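         # Expected layout: exactly three content lines plus a trailing newline, with
         # the field names on the second line and the corresponding values on the third.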
         assert len(lines) == 4 and lines[3] == ""
         header_row = lines[1].split("\t")
         results_d_row = lines[2].split("\t")
         assert len(header_row) == len(results_d_row)
         for key, val in zip(header_row, results_d_row):
             if not val:
                 continue
             results_d["{}.{}".format(field_name_prefix, key)] = float(val)
     return results_d
Example #4
 def _parse_joint_site_frequencies(self, filepath, field_name_prefix,
                                   results_d):
     with utility.universal_open(filepath) as src:
         lines = src.read().split("\n")
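         # The first header cell labels the rows, so only the remaining cells are column
         # keys; entries are keyed by row/column index rather than by label (see the
         # commented-out alternative below).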
         col_keys = lines[1].split("\t")[1:]
         row_idx = 0
         for line in lines[2:]:
             if not line:
                 continue
             cols = line.split("\t")
             assert len(cols) - 1 == len(col_keys)
             row_key = cols[0]
             col_idx = 0
             for col_key, val in zip(col_keys, cols[1:]):
                 # results_d["{}.{}.{}".format(field_name_prefix, row_key, col_key)] = float(val)
                 results_d["{}.{}.{}".format(field_name_prefix, row_idx,
                                             col_idx)] = float(val)
                 col_idx += 1
             row_idx += 1
     return results_d
 def read_simulated_data(self, filepaths):
     for filepath in filepaths:
         self.run_logger.info(
             "Reading simulation file: '{}'".format(filepath))
         with utility.universal_open(filepath) as src:
             reader = csv.DictReader(src,
                                     delimiter=self.field_delimiter,
                                     quoting=csv.QUOTE_NONE)
             for row_idx, row in enumerate(reader):
                 if self.logging_frequency and row_idx > 0 and row_idx % self.logging_frequency == 0:
                     self.run_logger.info(
                         "- Processing row {}".format(row_idx + 1))
                 if self.all_fieldnames is None:
                     self.all_fieldnames = list(reader.fieldnames)
                     self.stat_fieldnames = []
                     self.other_fieldnames = []
                     for field in reader.fieldnames:
                         if field.startswith(self.stats_field_prefix):
                             self.stat_fieldnames.append(field)
                         else:
                             self.other_fieldnames.append(field)
                     self.stat_fieldnames_check = set(self.stat_fieldnames)
                     self.other_fieldname_check = set(self.other_fieldnames)
                 row_stat_values = []
                 row_other_values = []
                 for key_idx, key in enumerate(
                         self.all_fieldnames
                 ):  # keys must be read in same order!
                     if not self.is_suppress_checks:
                         if key not in self.stat_fieldnames_check and key not in self.other_fieldname_check:
                             raise ValueError(
                                 "File '{}', row {}, column {}: field '{}' not recognized"
                                 .format(filepath, row_idx + 1, key_idx + 1,
                                         key))
                     if key.startswith(self.stats_field_prefix):
                         row_stat_values.append(float(row[key]))
                     else:
                         row_other_values.append(row[key])
                 # assert len(row) == len(row_stat_values) + len(row_other_values)
                 self.stat_values.append(row_stat_values)
                 self.other_values.append(row_other_values)
Example #6
 def write_results(self):
     for file_idx, file_info in enumerate(self.file_infos):
         output_filepath = self.compose_output_path_f(
             file_info.filepath, file_idx)
         self.run_logger.info("Writing file {} of {}: '{}'".format(
             file_idx + 1, len(self.file_infos), output_filepath))
         with utility.universal_open(output_filepath, "w") as dest:
             writer = utility.get_csv_writer(
                 dest=dest,
                 fieldnames=file_info.fieldnames,
                 delimiter=self.field_delimiter,
                 restval=self.missing_data_value,
             )
             writer.writeheader()
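             # Each output file covers a contiguous range of data-row indexes.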
             for data_row_idx in range(*file_info.data_row_idx_range):
                 if self.logging_frequency and data_row_idx > 0 and (
                         data_row_idx % self.logging_frequency) == 0:
                     self.run_logger.info(
                         "- Writing row {}".format(data_row_idx + 1))
                 row = {}
                 for field_name in file_info.fieldnames:
                     row[field_name] = self.fields[field_name][data_row_idx]
                 writer.writerow(row)
def main():
    parser = argparse.ArgumentParser()
    package_id = spectrasophy.package_id()
    parser.add_argument("--version", action="version", version=package_id)

    simulator_options = parser.add_argument_group("Simulation Configuration")
    simulator_options.add_argument(
        "configuration_filepath",
        metavar="CONFIGURATION-FILE",
        help="Path to file defining the simulation model and parameters.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument(
        '-o',
        '--output-name-prefix',
        action='store',
        dest='output_name_prefix',
        type=str,
        default=None,
        metavar='NAME-PREFIX',
        help=
        "Prefix for output filenames (default: same as configuration filename stem)."
    )
    output_options.add_argument(
        '-O',
        '--output-directory',
        action='store',
        dest='output_directory',
        type=str,
        default=None,
        metavar='DIRECTORY',
        help="Directory for output files (default: current working directory)."
    )
    output_options.add_argument(
        "-U",
        "--unfolded-site-frequency-spectrum",
        "--derived-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the unfolded or derived site frequency spectrum."
        " Otherwise, defaults to the folded or minor site frequency"
        " spectrum.")
    output_options.add_argument(
        "--infinite-sites-model",
        action="store_true",
        default=False,
        help="Use infinite sites model instead of finite sites.")
    output_options.add_argument(
        "--calculate-single-population-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the single (within) population site frequency"
        " spectrum in addition to the joint.")
    output_options.add_argument(
        "-l",
        "--labels",
        action="append",
        help=
        "Addition field/value pairs to add to the output (in format <FIELD-NAME>:value;)"
    )
    output_options.add_argument(
        '--field-delimiter',
        type=str,
        default='\t',
        help="Delimiter string separating fields in output (default: <TAB>').")
    output_options.add_argument(
        '--summary-stats-label-prefix',
        type=str,
        default='stat',
        metavar='PREFIX',
        help=
        "Prefix for summar statistic field labels (default: '%(default)s').")
    output_options.add_argument(
        "--include-model-id-field",
        action="store_true",
        default=False,
        help=
        "Include a 'model.id' field (with same value as 'param.divTimeModel' field) in output."
    )
    output_options.add_argument(
        "--append",
        action="store_true",
        default=False,
        help="Append instead of overwriting output file(s).")
    output_options.add_argument("--no-write-header",
                                action="store_true",
                                default=False,
                                help="Do not writer header row.")

    run_options = parser.add_argument_group("Run Options")
    run_options.add_argument(
        "-n",
        "--num-reps",
        type=int,
        default=1,
        help="Number of replicates (default: %(default)s).")
    run_options.add_argument(
        "-m",
        "--num-processes",
        default=1,
        type=int,
        help="Number of processes/CPU to run (default: %(default)s).")
    run_options.add_argument("-z",
                             "--random-seed",
                             default=None,
                             help="Seed for random number generator engine.")
    run_options.add_argument(
        "--log-frequency",
        default=None,
        type=int,
        help=
        "Frequency that background progress messages get written to the log (0: do not log informational messages)."
    )
    run_options.add_argument(
        "--file-logging-level",
        default="none",
        choices=[
            "debug",
            "info",
            "warning",
            "error",
            "critical",
            "none",
        ],
        help="Message level threshold for screen logs (default: %(default)s).")
    run_options.add_argument(
        "--stderr-logging-level",
        default="info",
        choices=[
            "debug",
            "info",
            "warning",
            "error",
            "critical",
            "none",
        ],
        help="Message level threshold for screen logs (default: %(default)s).")
    run_options.add_argument(
        '-w',
        '--working-directory-parent',
        action='store',
        type=str,
        default=None,
        help="Directory within which to create temporary directories and files."
    )
    run_options.add_argument("--no-cleanup",
                             action="store_true",
                             default=False,
                             help="Do not clean-up temporary files.")
    run_options.add_argument("--debug-mode",
                             action="store_true",
                             default=False,
                             help="Run in debugging mode.")

    fsc2_options = parser.add_argument_group("FastSimCoal2 Options")
    fsc2_options.add_argument(
        "--fsc2-path",
        metavar="FSC2-PATH",
        default="fsc25",
        help="Path to FastsimCoal2 application (default: %(default)s).")

    args = parser.parse_args()

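    # Assemble the simulator configuration from the legacy configuration file plus
    # the command-line options.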
    config_d = {}
    utility.parse_legacy_configuration(filepath=args.configuration_filepath,
                                       config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
        primary_source_filepath=args.configuration_filepath,
        output_name_prefix=args.output_name_prefix,
        output_directory=args.output_directory)
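    # Default progress-logging frequency is roughly every 10% of the requested
    # replicates; an explicit 0 disables informational logging.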
    if args.log_frequency is None:
        config_d["logging_frequency"] = int(args.num_reps / 10.0)
    elif args.log_frequency == 0:
        config_d["logging_frequency"] = None
    else:
        config_d["logging_frequency"] = args.log_frequency
    config_d["fsc2_path"] = args.fsc2_path
    config_d["file_logging_level"] = args.file_logging_level
    config_d["standard_error_logging_level"] = args.stderr_logging_level
    # config_d["log_to_file"] = args.log_to_file
    # config_d["log_to_stderr"] = args.log_to_stderr
    config_d[
        "is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d[
        "is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["is_infinite_sites_model"] = args.infinite_sites_model
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(
        args.labels)
    config_d["field_delimiter"] = args.field_delimiter
    config_d["is_include_model_id_field"] = args.include_model_id_field
    with utility.TemporaryDirectory(
            prefix="spectrasophy-",
            parent_dir=args.working_directory_parent,
            is_suppress_cleanup=args.no_cleanup) as working_directory:
        config_d["working_directory"] = working_directory
        simulator = simulate.SpectrasophySimulator(
            config_d=config_d,
            num_processes=args.num_processes,
            is_verbose_setup=True,
            package_id=package_id,
        )
        filepath = config_d["output_prefix"] + ".sumstats.tsv"
        dest = utility.universal_open(filepath, "a" if args.append else "w")
        # dest = utility.open_destput_file_for_csv_writer(
        #         filepath=filepath,
        #         is_append=args.append)
        if args.append or args.no_write_header:
            is_write_header = False
        else:
            is_write_header = True
        with dest:
            # writer = utility.get_csv_writer(
            #         dest=dest,
            #         delimiter=args.field_delimiter)
            try:
                results = simulator.execute(nreps=args.num_reps,
                                            dest=dest,
                                            results_store=None,
                                            is_write_header=is_write_header)
            except Exception as e:
                sys.stderr.write(
                    "Traceback (most recent call last):\n  {}{}\n".format(
                        "  ".join(traceback.format_tb(sys.exc_info()[2])), e))
                sys.exit(1)
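# A hypothetical invocation of this front-end (the script name is a placeholder; the
# options are those defined above):
#
#   python simulate-cli.py my-model.cfg -n 1000 -m 4 -o sims -O results
#
# This would run 1000 replicates on 4 processes and write the simulated summary
# statistics to a file named from the output prefix plus ".sumstats.tsv"
# (here something like "results/sims.sumstats.tsv").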
Example #8
 def read_files(self, filepaths):
     for file_idx, filepath in enumerate(filepaths):
         self.run_logger.info("Reading file {} of {}: '{}'".format(
             file_idx + 1, len(filepaths), filepath))
         with utility.universal_open(filepath) as src:
             self._read_file(src)
 def summarize(
     self,
     target_data_filepath,
 ):
     with utility.universal_open(target_data_filepath) as src:
         reader = csv.DictReader(src,
                                 delimiter=self.field_delimiter,
                                 quoting=csv.QUOTE_NONE)
         categorical_params = collections.OrderedDict()
         continuous_params = collections.OrderedDict()
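         # Classify each field on first encounter: the divergence-time model fields and
         # any field whose value cannot be parsed as a float are treated as categorical
         # (counted); everything else is treated as continuous (values accumulated).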
         for row_idx, row in enumerate(reader):
             for key_idx, key in enumerate(reader.fieldnames):
                 if key in categorical_params:
                     categorical_params[key][row[key]] += 1
                 elif key in continuous_params:
                     continuous_params[key].append(float(row[key]))
                 else:
                     if key in ("param.DivTimeModel", "param.numDivTimes"):
                         val = row[key]
                         is_categorical = True
                     else:
                         try:
                             val = float(row[key])
                             is_categorical = False
                         except ValueError:
                             val = row[key]
                             is_categorical = True
                     if is_categorical:
                         categorical_params[key] = collections.Counter()
                         categorical_params[key][val] += 1
                     else:
                         continuous_params[key] = [val]
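         # Continuous parameters: one summary row per parameter (mean, variance, range,
         # HPD and quantile intervals), all written to a single ".summary.continuous.tsv"
         # file.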
         output_prefix = os.path.splitext(
             os.path.basename(target_data_filepath))[0]
         with utility.universal_open(
                 output_prefix + ".summary.continuous.tsv", "w") as dest:
             row_results = collections.OrderedDict()
             for param_idx, param_name in enumerate(continuous_params):
                 values = continuous_params[param_name]
                 row_results["param"] = param_name
                 summary = statistics.summarize(values)
                 row_results["mean"] = summary["mean"]
                 row_results["var"] = summary["var"]
                 row_results["sd"] = summary["sd"]
                 row_results["min"] = summary["range"][0]
                 row_results["max"] = summary["range"][1]
                 row_results["hpd5"] = summary["hpd95"][0]
                 row_results["hpd95"] = summary["hpd95"][1]
                 try:
                     row_results["quant5"] = summary["quant_5_95"][0]
                     row_results["quant95"] = summary["quant_5_95"][1]
                 except TypeError:
                     row_results["quant5"] = "NA"
                     row_results["quant95"] = "NA"
                 if param_idx == 0:
                     dest.write(
                         self.field_delimiter.join(row_results.keys()) +
                         "\n")
                 dest.write(
                     self.field_delimiter.join(
                         "{}".format(v)
                         for v in row_results.values()) + "\n")
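         # Categorical parameters: one output file per parameter, with one row per
         # category giving its frequency and count.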
         for param_idx, param_name in enumerate(categorical_params):
             with utility.universal_open(
                     output_prefix + ".summary.{:02d}.{}.tsv".format(
                         param_idx + 1, param_name), "w") as dest:
                 param_counter = categorical_params[param_name]
                 total = float(sum(param_counter.values()))
                 for category_idx, (category_name,
                                    category_count) in enumerate(
                                        param_counter.most_common()):
                     row_results = collections.OrderedDict()
                     row_results["label"] = category_name
                     row_results["freq"] = category_count / total
                     row_results["count"] = category_count
                     if category_idx == 0:
                         dest.write(
                             self.field_delimiter.join(row_results.keys()) +
                             "\n")
                     dest.write(
                         self.field_delimiter.join(
                             "{}".format(v)
                             for v in row_results.values()) + "\n")
Example #10
def main():
    parser = argparse.ArgumentParser()
    package_id = spectrasophy.package_id()
    parser.add_argument("--version", action="version", version=package_id)

    simulator_options = parser.add_argument_group("Configuration")
    simulator_options.add_argument("configuration_filepath",
            metavar="CONFIGURATION-FILE",
            help="Path to the configuration file listing the data.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument('-o', '--output-name-prefix',
            action='store',
            dest='output_name_prefix',
            type=str,
            default=None,
            metavar='NAME-PREFIX',
            help="Prefix for output filenames (default: same as configuration filename stem).")
    output_options.add_argument('-O', '--output-directory',
            action='store',
            dest='output_directory',
            type=str,
            default=None,
            metavar='DIRECTORY',
            help="Directory for output files (default: current working directory).")
    output_options.add_argument(
            "-U",
            "--unfolded-site-frequency-spectrum",
            "--derived-site-frequency-spectrum",
            action="store_true",
            default=False,
            help="Calculate the unfolded or derived site frequency spectrum."
            " Otherwise, defaults to the folded or minor site frequency"
            " spectrum."
            )
    output_options.add_argument(
            "--calculate-single-population-site-frequency-spectrum",
            action="store_true",
            default=False,
            help="Calculate the single (within) population site frequency"
            " spectrum in addition to the joint."
            )
    output_options.add_argument("-l", "--labels",
            action="append",
            help="Addition field/value pairs to add to the output (in format <FIELD-NAME>:value;)")
    output_options.add_argument('--field-delimiter',
            type=str,
            default='\t',
            help="Delimiter string separating fields in output (default: <TAB>').")
    output_options.add_argument('--summary-stats-label-prefix',
            type=str,
            default='stat',
            metavar='PREFIX',
            help="Prefix for summar statistic field labels (default: '%(default)s').")
    output_options.add_argument( "--append",
            action="store_true",
            default=False,
            help="Append instead of overwriting output file(s).")
    output_options.add_argument( "--no-write-header",
            action="store_true",
            default=False,
            help="Do not writer header row.")

    args = parser.parse_args()

    config_d = {}
    utility.parse_legacy_configuration(
            filepath=args.configuration_filepath,
            config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
            primary_source_filepath=args.configuration_filepath,
            output_name_prefix=args.output_name_prefix,
            output_directory=args.output_directory)
    config_d["is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d["is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(args.labels)
    config_d["alignment_directory_head"] = os.path.dirname(os.path.abspath(args.configuration_filepath))
    config_d["field_delimiter"] = args.field_delimiter

    sscalc = sumstats.SpectrasophySummaryStatsCalculator(**config_d)
    filepath = config_d["output_prefix"] + ".obs.sumstats.tsv"
    # dest = utility.open_destput_file_for_csv_writer(
    #         filepath=filepath,
    #         is_append=args.append)
    dest = utility.universal_open(filepath, "a" if args.append else "w")
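    # Write a header row only for a fresh (non-append) run, unless explicitly suppressed.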
    if args.append or args.no_write_header:
        is_write_header = False
    else:
        is_write_header = True
    with dest:
        # writer = utility.get_csv_writer(
        #         dest=dest,
        #         delimiter=args.field_delimiter)
        try:
            results = sscalc.write_summary_stats(
                    dest=dest,
                    results_store=None,
                    is_write_header=is_write_header)
        except Exception as e:
            sys.stderr.write("Traceback (most recent call last):\n  {}{}\n".format(
                "  ".join(traceback.format_tb(sys.exc_info()[2])),
                e))
            sys.exit(1)
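# A hypothetical invocation of this front-end (the script name is a placeholder; the
# options are those defined above):
#
#   python sumstats-cli.py my-data.cfg -o observed -O results
#
# This would write the observed summary statistics to a file named from the output
# prefix plus ".obs.sumstats.tsv" (here something like "results/observed.obs.sumstats.tsv").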