Example #1
 def _parse_joint_site_frequencies(self, filepath, field_name_prefix,
                                   is_normalize_by_site_counts,
                                   lineage_pair, locus_definition,
                                   results_d):
     with utility.universal_open(filepath) as src:
         lines = src.read().split("\n")
         col_keys = [
             c for c in lines[1].split("\t")[1:]
             if "sites with multiple" not in c
         ]
         row_idx = 0
         for line in lines[2:]:
             if not line:
                 continue
             cols = line.split("\t")
             if len(cols) - 1 != len(col_keys):
                 raise ValueError(
                     "Row {}: Expecting {} columns, but found {}: {}".
                     format(row_idx + 1, len(col_keys),
                            len(cols) - 1, cols))
             row_key = cols[0]
             col_idx = 0
             for col_key, val in zip(col_keys, cols[1:]):
                 val = float(val)
                 if is_normalize_by_site_counts:
                     val = val / locus_definition.num_sites
                 results_d["{}.{}.{}".format(field_name_prefix, row_idx,
                                             col_idx)] = val
                 col_idx += 1
             row_idx += 1
     return results_d
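A standalone sketch of the tabular layout the parser above implies: a title line, a tab-separated header of column keys (with a placeholder first cell), and data rows that each begin with a row key. The title text and numbers below are hypothetical; only the structure comes from the parsing logic.

# Hypothetical fastsimcoal2-style joint SFS output (structure inferred
# from the parser above; the title and values are made up).
SAMPLE = (
    "2 observations\n"
    "\td0_0\td0_1\td0_2\n"
    "d1_0\t0.0\t1.0\t2.0\n"
    "d1_1\t3.0\t4.0\t5.0\n"
)

lines = SAMPLE.split("\n")
col_keys = [c for c in lines[1].split("\t")[1:] if "sites with multiple" not in c]
results = {}
for row_idx, line in enumerate(l for l in lines[2:] if l):
    cols = line.split("\t")
    assert len(cols) - 1 == len(col_keys)
    for col_idx, val in enumerate(cols[1:]):
        # Keys are indexed positionally, mirroring the method above.
        results["stat.{}.{}".format(row_idx, col_idx)] = float(val)
print(results["stat.0.1"])  # 1.0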
Example #2
 def _generate_parameter_file(
     self,
     fsc2_config_d,
 ):
     assert self.parameter_filepath
     with utility.universal_open(
             os.path.join(self.working_directory, self.parameter_filepath),
             "w") as dest:
         self._write_parameter_configuration(
             dest=dest,
             fsc2_config_d=fsc2_config_d,
         )
Example #3
 def _parse_raw_results_dna(self):
     data_dict = collections.OrderedDict()
     decodings = list(itertools.permutations("ACGT", 4))
     column_decodings = {}
     with utility.universal_open(self.arlequin_filepath) as src:
         idx = 0
         first_pop_max_ind_id = 0
         in_alignment = False
         for row in src:
             row = row.rstrip("\n")  # chomp terminating newline, if any
             if in_alignment:
                 if row == "":
                     if idx == 2:
                         break
                     else:
                         in_alignment = False
                 else:
                     x1, x2, encoded_data = row.split("\t")
                     decoded_data = []
                     for col_idx, ch in enumerate(encoded_data):
                         # Each alignment column gets one randomly
                         # drawn digit-to-base mapping, cached for
                         # reuse across all sequences.
                         try:
                             decoding_lookup = column_decodings[col_idx]
                         except KeyError:
                             decoding_lookup = self.rng.choice(decodings)
                             column_decodings[col_idx] = decoding_lookup
                         decoded_data.append(decoding_lookup[int(ch)])
                     pop_id, ind_id = [
                         int(label_part) for label_part in x1.split("_")
                     ]
                     # This hack compensates for fastsimcoal numbering
                     # individuals inconsistently on trees vs. sequences,
                     # even assuming there is a correspondence.
                     if pop_id == 1:
                         first_pop_max_ind_id = max(
                             first_pop_max_ind_id, ind_id)
                     else:
                         ind_id += first_pop_max_ind_id
                     taxon_label = self._compose_raw_data_taxon_label(
                         population_id=pop_id, individual_id=ind_id)
                     data_dict[taxon_label] = "".join(decoded_data)
             elif "SampleData=" in row:
                 idx += 1
                 in_alignment = True
     return dendropy.DnaCharacterMatrix.from_dict(data_dict)
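The column decoding above can be exercised in isolation. A minimal sketch, assuming the same convention of digit-encoded sites (0-3) and using the standard-library `random` module in place of `self.rng`:

import itertools
import random

rng = random.Random(1)
decodings = list(itertools.permutations("ACGT", 4))
column_decodings = {}

def decode(encoded):
    # Each column is assigned one randomly drawn digit-to-base
    # mapping, which is cached and reused for that column across
    # all sequences.
    out = []
    for col_idx, ch in enumerate(encoded):
        if col_idx not in column_decodings:
            column_decodings[col_idx] = rng.choice(decodings)
        out.append(column_decodings[col_idx][int(ch)])
    return "".join(out)

print(decode("0123"))  # deterministic for a fixed seed
print(decode("0123"))  # identical: the per-column mappings are cached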
Example #4
 def process(self, target_data_filepath, priors_data_filepaths,
             output_name_prefix, output_directory, output_suffix):
     if output_name_prefix is None:
         output_name_prefix = os.path.splitext(
             os.path.basename(target_data_filepath))[0]
     if output_suffix is None:
         output_suffix = ""
     else:
         output_suffix = "." + output_suffix
     with utility.universal_open(target_data_filepath) as src:
         target_data_reader = csv.DictReader(src,
                                             delimiter=self.field_delimiter,
                                             quoting=csv.QUOTE_NONE)
         for target_row_idx, target_row in enumerate(target_data_reader):
             if target_row_idx == 0:
                 self.stat_fieldnames, self.non_stat_fieldnames = self.extract_stat_fieldnames(
                     target_data_reader.fieldnames)
                 self.stat_fieldnames_set = set(self.stat_fieldnames)
             self.run_logger.info(
                 "Scoring target data {}".format(target_row_idx + 1))
             target_data_vector = self.extract_stats_data_vector_from_csv_row(
                 target_row)
             posteriors_filepath = os.path.join(
                 output_directory,
                 "{}.posterior.{:03d}.samples{}.tsv".format(
                     output_name_prefix, target_row_idx + 1, output_suffix))
             self.accept_reject(target_data_vector=target_data_vector,
                                priors_data_filepaths=priors_data_filepaths,
                                output_filepath=posteriors_filepath)
             if self.is_output_target_params:
                 target_params_filepath = os.path.join(
                     output_directory,
                     "{}.posterior.{:03d}.target{}.tsv".format(
                         output_name_prefix, target_row_idx + 1,
                         output_suffix))
                 with open(target_params_filepath, "w") as target_params_f:
                     target_params_f.write(
                         self.field_delimiter.join(
                             self.non_stat_fieldnames))
                     target_params_f.write("\n")
                     target_params_f.write(
                         self.field_delimiter.join(
                             str(target_row[k])
                             for k in self.non_stat_fieldnames))
                     target_params_f.write("\n")
Example #5
 def _parse_deme_site_frequencies(self, filepath, field_name_prefix,
                                  is_normalize_by_site_counts, lineage_pair,
                                  locus_definition, results_d):
     with utility.universal_open(filepath) as src:
         lines = src.read().split("\n")
         assert len(lines) == 4 and lines[3] == ""
         header_row = lines[1].split("\t")
         results_d_row = lines[2].split("\t")
         assert len(header_row) == len(results_d_row)
         for key, val in zip(header_row, results_d_row):
             if not val:
                 continue
             key = "{}.{}".format(field_name_prefix, key)
             val = float(val)
             if is_normalize_by_site_counts:
                 val = val / locus_definition.num_sites
             results_d[key] = val
     return results_d
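A standalone sketch of the file shape the asserts above imply: a title line, a tab-separated header, one data row, and a trailing newline (so splitting on "\n" yields exactly four items, the last empty). Header names and values are hypothetical:

SAMPLE = "1 observation\nd0_0\td0_1\td0_2\n10\t\t5\n"

lines = SAMPLE.split("\n")
assert len(lines) == 4 and lines[3] == ""
header_row = lines[1].split("\t")
value_row = lines[2].split("\t")
assert len(header_row) == len(value_row)
results = {}
for key, val in zip(header_row, value_row):
    if not val:  # empty cells are skipped, as in the parser above
        continue
    results["stat." + key] = float(val)
print(results)  # {'stat.d0_0': 10.0, 'stat.d0_2': 5.0}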
Example #6
 def write_results(self):
     for file_idx, file_info in enumerate(self.file_infos):
         output_filepath = self.compose_output_path_f(
             file_info.filepath, file_idx)
         self.run_logger.info("Writing file {} of {}: '{}'".format(
             file_idx + 1, len(self.file_infos), output_filepath))
         with utility.universal_open(output_filepath, "w") as dest:
             writer = utility.get_csv_writer(
                 dest=dest,
                 fieldnames=file_info.fieldnames,
                 delimiter=self.field_delimiter,
                 restval=self.missing_data_value,
             )
             writer.writeheader()
             for data_row_idx in range(*file_info.data_row_idx_range):
                 if self.logging_frequency and data_row_idx > 0 and (
                         data_row_idx % self.logging_frequency) == 0:
                     self.run_logger.info(
                         "- Writing row {}".format(data_row_idx + 1))
                 row = {}
                 for field_name in file_info.fieldnames:
                     row[field_name] = self.fields[field_name][data_row_idx]
                 writer.writerow(row)
Example #7
    def accept_reject(self, target_data_vector, priors_data_filepaths,
                      output_filepath):
        if self.rejection_criteria_type == "num":
            num_to_retain = self.rejection_criteria_value
        else:
            num_to_retain = None
        dest = utility.universal_open(output_filepath, "w")
        all_prior_fieldnames = []
        all_prior_fieldnames_set = None
        accepted_heap = []
        for fidx, priors_data_filepath in enumerate(priors_data_filepaths):
            self.run_logger.info(
                "Reading simulation file {} of {}: '{}'".format(
                    fidx + 1, len(priors_data_filepaths),
                    priors_data_filepath))
            with utility.universal_open(priors_data_filepath) as src:
                priors_data_reader = csv.DictReader(
                    src,
                    delimiter=self.field_delimiter,
                    quoting=csv.QUOTE_NONE)
                for row_idx, row in enumerate(priors_data_reader):
                    if self.logging_frequency and row_idx > 0 and row_idx % self.logging_frequency == 0:
                        self.run_logger.info(
                            "Reading simulation file {} of {}, row {}".format(
                                fidx + 1, len(priors_data_filepaths),
                                row_idx + 1))
                    if row_idx == 0:
                        if fidx == 0:
                            all_prior_fieldnames = list(
                                priors_data_reader.fieldnames)
                            all_prior_fieldnames_set = set(
                                all_prior_fieldnames)
                            current_file_stat_fieldnames = set(
                                self.extract_stat_fieldnames(
                                    priors_data_reader.fieldnames)[0])
                            s1 = current_file_stat_fieldnames - self.stat_fieldnames_set
                            if s1:
                                raise ValueError(
                                    "File '{}': the following summary statistic fields were not found in the target: {}".format(
                                        priors_data_filepath, ", ".join(s1)))
                            s2 = self.stat_fieldnames_set - current_file_stat_fieldnames
                            if s2:
                                raise ValueError(
                                    "File '{}': the following summary statistic fields appear in the target but not here: {}".format(
                                        priors_data_filepath, ", ".join(s2)))
                            header_row = []
                            for fnidx, fn in enumerate(all_prior_fieldnames):
                                if self.is_write_summary_stats or fn not in self.stat_fieldnames_set:
                                    header_row.append(fn)
                            if self.is_write_rejection_score:
                                header_row.append(
                                    self.distance_score_fieldname)
                                header_row.append(
                                    self.normalized_distance_score_fieldname)
                            dest.write("{}\n".format(
                                self.field_delimiter.join(header_row)))
                        else:
                            current_file_fieldnames = set(
                                priors_data_reader.fieldnames)
                            s1 = current_file_fieldnames - all_prior_fieldnames_set
                            if s1:
                                raise ValueError(
                                    "File '{}': the following fields were found here but not in previous files: {}".format(
                                        priors_data_filepath, ", ".join(s1)))
                            s2 = all_prior_fieldnames_set - current_file_fieldnames
                            if s2:
                                raise ValueError(
                                    "File '{}': the following fields were found in previous files but not here: {}".format(
                                        priors_data_filepath, ", ".join(s2)))
                    try:
                        prior_data_vector = self.extract_stats_data_vector_from_csv_row(
                            row)
                    except SisterBayesRejectorStatsVectorValueException:
                        if self.is_ignore_invalid_priors_data_vectors:
                            continue
                        else:
                            raise
                    except SisterBayesRejectorStatsVectorSizeException:
                        if self.is_ignore_invalid_priors_data_vectors:
                            continue
                        else:
                            raise
                    distance_score = self.euclidean_distance(
                        target_data_vector, prior_data_vector)

                    row_values = self.field_delimiter.join(
                        row[fn] for fn in priors_data_reader.fieldnames
                        if self.is_write_summary_stats
                        or fn not in self.stat_fieldnames_set)
                    if self.is_write_rejection_score:
                        row_values = "{}{}{}".format(row_values,
                                                     self.field_delimiter,
                                                     distance_score)
                        # Normalize by number of comparisons
                        # How do we get this?
                        # Consider the following vectors:
                        #   > x1 = c(3.1, 3.1, 3.1)
                        #   > x2 = c(5.1, 5.1, 5.1)
                        #   > y1 = c(3.1, 3.1, 3.1, 3.1, 3.1)
                        #   > y2 = c(5.1, 5.1, 5.1, 5.1, 5.1)
                        # The naive/raw Euclidean distances are different, due to the different number of comparisons:
                        #   > sqrt(sum((x2-x1)**2))
                        #   [1] 3.464102
                        #   > sqrt(sum((y2-y1)**2))
                        #   [1] 4.472136
                        # But dividing by the square root of the length of each vector makes them equal:
                        #   > sqrt(sum((x2-x1)**2)) / sqrt(3)
                        #   [1] 2
                        #   > sqrt(sum((y2-y1)**2)) / sqrt(5)
                        #   [1] 2
                        normalized_distance_score = distance_score / math.sqrt(
                            len(target_data_vector))
                        row_values = "{}{}{}".format(
                            row_values, self.field_delimiter,
                            normalized_distance_score)
                    # Negate the distance so the min-heap's root is the
                    # largest retained distance, i.e. the first evicted.
                    heap_score = -distance_score
                    heap_entry = (heap_score, row_values)
                    if self.rejection_criteria_type == "distance":
                        if distance_score <= self.rejection_criteria_value:
                            accepted_heap.append(heap_entry)
                    elif self.rejection_criteria_type == "num":
                        if len(accepted_heap) < num_to_retain:
                            accepted_heap.append(heap_entry)
                            if len(accepted_heap) == num_to_retain:
                                heapq.heapify(accepted_heap)
                        else:
                            heapq.heappushpop(accepted_heap, heap_entry)
                    else:
                        raise NotImplementedError(self.rejection_criteria_type)
        # Sorting on the negated scores writes accepted rows in order
        # of increasing distance.
        accepted_heap.sort(reverse=True)
        for heap_entry in accepted_heap:
            dest.write(heap_entry[1])
            dest.write("\n")
        dest.flush()
        dest.close()
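A self-contained sketch of the two ideas used by `accept_reject`: normalizing the Euclidean distance by the square root of the vector length (mirroring the worked R example in the comments above), and retaining the k closest rows with a fixed-size min-heap of negated scores. All names and values here are illustrative:

import heapq
import math

def euclidean(u, v):
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(u, v)))

# Dividing by sqrt(len(vector)) makes distances comparable across
# vectors of different lengths (both print ~2.0):
print(euclidean([3.1] * 3, [5.1] * 3) / math.sqrt(3))
print(euclidean([3.1] * 5, [5.1] * 5) / math.sqrt(5))

# Keep the k smallest distances: negate scores so the min-heap root is
# the worst retained row, which heappushpop then evicts first.
k = 3
heap = []
for distance in [5.0, 1.0, 4.0, 2.0, 3.0]:
    entry = (-distance, "row-{}".format(distance))
    if len(heap) < k:
        heap.append(entry)
        if len(heap) == k:
            heapq.heapify(heap)
    else:
        heapq.heappushpop(heap, entry)
print(sorted(-score for score, _ in heap))  # [1.0, 2.0, 3.0]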
Example #8
def main():
    parser = argparse.ArgumentParser()
    package_id = sisterbayes.package_id()
    parser.add_argument("--version", action="version", version=package_id)

    simulator_options = parser.add_argument_group("Simulation Configuration")
    simulator_options.add_argument(
        "configuration_filepath",
        metavar="CONFIGURATION-FILE",
        help="Path to file defining the simulation model and parameters.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument(
        '-o',
        '--output-name-prefix',
        action='store',
        dest='output_name_prefix',
        type=str,
        default=None,
        metavar='NAME-PREFIX',
        help=
        "Prefix for output filenames (default: same as configuration filename stem)."
    )
    output_options.add_argument(
        '-O',
        '--output-directory',
        action='store',
        dest='output_directory',
        type=str,
        default=None,
        metavar='DIRECTORY',
        help="Directory for output files (default: current working directory)."
    )
    output_options.add_argument(
        "-U",
        "--unfolded-site-frequency-spectrum",
        "--derived-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the unfolded or derived site frequency spectrum."
        " Otherwise, defaults to the folded or minor site frequency"
        " spectrum.")
    output_options.add_argument(
        "--infinite-sites-model",
        action="store_true",
        default=False,
        help="Use infinite sites model instead of finite sites.")
    output_options.add_argument(
        "--calculate-single-population-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the single (within) population site frequency"
        " spectrum in addition to the joint.")
    output_options.add_argument(
        "--concatenate-loci",
        action="store_true",
        default=False,
        help="Concatenate statistics for all loci into one effective locus.",
    )
    output_options.add_argument(
        "--no-normalize-by-concatenated-loci-count",
        dest="is_normalize_by_concatenated_num_loci",
        action="store_false",
        default=True,
        help=
        "If concatenating loci, do NOT normalize frequency spectrum values by number of loci.",
    )
    output_options.add_argument(
        "--no-normalize-by-site-counts",
        dest="normalize_by_site_counts",
        action="store_false",
        default=True,
        help=
        "Do *not* normalize frequency spectrum values by number of sites in each locus."
    )
    output_options.add_argument(
        "-l",
        "--labels",
        action="append",
        help=
        "Addition field/value pairs to add to the output (in format <FIELD-NAME>:value;)"
    )
    output_options.add_argument(
        '--field-delimiter',
        type=str,
        default='\t',
        help="Delimiter string separating fields in output (default: <TAB>').")
    output_options.add_argument(
        '--summary-stats-label-prefix',
        type=str,
        default='stat',
        metavar='PREFIX',
        help=
        "Prefix for summar statistic field labels (default: '%(default)s').")
    output_options.add_argument(
        "--include-model-id-field",
        action="store_true",
        default=False,
        help=
        "Include a 'model.id' field (with same value as 'param.divTimeModel' field) in output."
    )
    output_options.add_argument(
        "--append",
        action="store_true",
        default=False,
        help="Append instead of overwriting output file(s).")
    output_options.add_argument("--no-write-header",
                                action="store_true",
                                default=False,
                                help="Do not writer header row.")
    output_options.add_argument("--raw-data",
                                action="store_true",
                                default=False,
                                help="Output raw data (alignments and trees).")
    output_options.add_argument("--raw-data-alignment",
                                action="store_true",
                                default=False,
                                help="Output raw alignment.")
    output_options.add_argument("--raw-data-mutation-tree",
                                action="store_true",
                                default=False,
                                help="Output raw mutation tree.")
    output_options.add_argument("--raw-data-true-tree",
                                action="store_true",
                                default=False,
                                help="Output raw true tree.")
    output_options.add_argument(
        "--raw-data-alignment-format",
        default="fasta",
        choices=["fasta", "phylip", "nexus"],
        help=
        "Format for the raw data alignments ('fasta', 'phylip', or 'nexus'; default='fasta')."
    )
    output_options.add_argument(
        "--raw-data-tree-format",
        default="nexus",
        choices=["nexus", "newick", "nexml"],
        help=
        "Format for the raw data trees ('nexus', 'newick', or 'nexml'; default='nexus')."
    )
    output_options.add_argument(
        "--params-only-file",
        action="store_true",
        default=False,
        help=
        "Output file consisting of parameters only (for checking/validation).")

    run_options = parser.add_argument_group("Run Options")
    run_options.add_argument(
        "-n",
        "--num-reps",
        type=int,
        default=1,
        help="Number of replicates (default: %(default)s).")
    run_options.add_argument(
        "-m",
        "--num-processes",
        default=1,
        type=int,
        help="Number of processes/CPU to run (default: %(default)s).")
    run_options.add_argument("-z",
                             "--random-seed",
                             default=None,
                             help="Seed for random number generator engine.")
    run_options.add_argument("-q",
                             "--quiet",
                             action="store_true",
                             help="Work silently.")
    run_options.add_argument('--log-to-file',
                             action='store_true',
                             dest='log_to_file',
                             default=None,
                             help="Save log to file.")
    run_options.add_argument(
        "--log-frequency",
        default=None,
        type=int,
        help=
        "Frequency that background progress messages get written to the log (0: do not log informational messages)."
    )
    run_options.add_argument(
        "--file-logging-level",
        default="none",
        choices=[
            "debug",
            "info",
            "warning",
            "error",
            "critical",
            "none",
        ],
        help="Message level threshold for screen logs (default: %(default)s).")
    run_options.add_argument(
        "--stderr-logging-level",
        default="info",
        choices=[
            "debug",
            "info",
            "warning",
            "error",
            "critical",
            "none",
        ],
        help="Message level threshold for screen logs (default: %(default)s).")
    run_options.add_argument(
        '-w',
        '--working-directory-parent',
        action='store',
        type=str,
        default=None,
        help="Directory within which to create temporary directories and files."
    )
    run_options.add_argument("--no-cleanup",
                             action="store_true",
                             default=False,
                             help="Do not clean-up temporary files.")
    run_options.add_argument("--debug-mode",
                             action="store_true",
                             default=False,
                             help="Run in debugging mode.")

    fsc2_options = parser.add_argument_group("FastSimCoal2 Options")
    fsc2_options.add_argument(
        "--fsc2-path",
        metavar="FSC2-PATH",
        default=os.environ.get("SISTERBAYES_FSC2_PATH", "fsc"),
        help="Path to FastsimCoal2 application (default: %(default)s).")

    args = parser.parse_args()

    config_d = {}
    if not os.path.exists(args.configuration_filepath):
        sys.exit("ERROR: Configuration file '{}' not found.".format(
            args.configuration_filepath))
    utility.parse_legacy_configuration(filepath=args.configuration_filepath,
                                       config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
        primary_source_filepath=args.configuration_filepath,
        output_name_prefix=args.output_name_prefix,
        output_directory=args.output_directory)
    if args.log_frequency is None:
        config_d["logging_frequency"] = int(args.num_reps / 10.0)
    elif args.log_frequency == 0:
        config_d["logging_frequency"] = None
    else:
        config_d["logging_frequency"] = args.log_frequency
    config_d["fsc2_path"] = args.fsc2_path
    if utility.which(config_d["fsc2_path"]) is None:
        sys.exit("ERROR: FastSimCoal2 executable '{}' not found.\n"
                 "Install FastSimCoal2 and specify path to the executable\n"
                 "using the '--fsc2-path' argument.".format(
                     config_d["fsc2_path"]))
    config_d["file_logging_level"] = args.file_logging_level
    config_d["standard_error_logging_level"] = args.stderr_logging_level
    config_d["log_to_file"] = args.log_to_file
    config_d["log_to_stderr"] = not args.quiet
    config_d[
        "is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d[
        "is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["is_infinite_sites_model"] = args.infinite_sites_model
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(
        args.labels)
    config_d["field_delimiter"] = args.field_delimiter
    config_d["is_include_model_id_field"] = args.include_model_id_field
    config_d["is_concatenate_loci"] = args.concatenate_loci
    config_d[
        "is_normalize_by_concatenated_num_loci"] = args.is_normalize_by_concatenated_num_loci
    config_d["is_normalize_by_site_counts"] = args.normalize_by_site_counts
    is_store_raw_alignment = False
    is_store_raw_mutation_tree = False
    is_store_raw_true_tree = False
    if args.raw_data:
        is_store_raw_alignment = True
        is_store_raw_mutation_tree = True
        is_store_raw_true_tree = True
    if args.raw_data_alignment:
        is_store_raw_alignment = True
    if args.raw_data_mutation_tree:
        is_store_raw_mutation_tree = True
    if args.raw_data_true_tree:
        is_store_raw_true_tree = True
    with utility.TemporaryDirectory(
            prefix="sisterbayes-",
            parent_dir=args.working_directory_parent,
            is_suppress_cleanup=args.no_cleanup) as working_directory:
        config_d["working_directory"] = working_directory
        simulator = simulate.SisterBayesSimulator(
            config_d=config_d,
            num_processes=args.num_processes,
            is_verbose_setup=True,
            package_id=package_id,
            is_store_raw_alignment=is_store_raw_alignment,
            is_store_raw_mutation_tree=is_store_raw_mutation_tree,
            is_store_raw_true_tree=is_store_raw_true_tree,
            raw_data_output_prefix=config_d["output_prefix"],
            raw_data_alignment_format=args.raw_data_alignment_format,
            raw_data_tree_format=args.raw_data_tree_format,
            is_debug_mode=args.debug_mode,
        )
        main_dest_filepath = config_d["output_prefix"] + ".stats.tsv"
        dest = utility.universal_open(main_dest_filepath,
                                      "a" if args.append else "w")
        if args.params_only_file:
            params_only_dest_filepath = config_d[
                "output_prefix"] + ".params.tsv"
            params_only_dest = utility.universal_open(
                params_only_dest_filepath, "a" if args.append else "w")
        else:
            params_only_dest = None
        if args.append or args.no_write_header:
            is_write_header = False
        else:
            is_write_header = True
        with dest:
            try:
                results = simulator.execute(
                    nreps=args.num_reps,
                    dest=dest,
                    results_store=None,
                    params_only_dest=params_only_dest,
                    is_write_header=is_write_header,
                )
            except Exception as e:
                sys.stderr.write(
                    "Traceback (most recent call last):\n  {}{}\n".format(
                        "  ".join(traceback.format_tb(sys.exc_info()[2])), e))
                sys.exit(1)
        if params_only_dest:
            params_only_dest.close()
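A sketch of how this front end might be invoked, using only the flags it defines; the script name here is hypothetical, since the actual entry-point name is not shown in this snippet:

    python simulate_cli.py model.config -n 1000 -m 4 -z 42 \
        -o run1 -O results --concatenate-loci --params-only-file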
Example #9
def main():
    parser = argparse.ArgumentParser()
    package_id = sisterbayes.package_id()
    parser.add_argument("--version", action="version", version=package_id)

    simulator_options = parser.add_argument_group("Configuration")
    simulator_options.add_argument(
        "configuration_filepath",
        metavar="CONFIGURATION-FILE",
        help="Path to the configuration file listing the data.")
    processing_options = parser.add_argument_group("Processing Options")
    processing_options.add_argument(
        "-U",
        "--unfolded-site-frequency-spectrum",
        "--derived-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the unfolded or derived site frequency spectrum."
        " Otherwise, defaults to the folded or minor site frequency"
        " spectrum.")
    processing_options.add_argument(
        "--calculate-single-population-site-frequency-spectrum",
        action="store_true",
        default=False,
        help="Calculate the single (within) population site frequency"
        " spectrum in addition to the joint.")
    processing_options.add_argument(
        "--no-normalize-by-site-counts",
        dest="normalize_by_site_counts",
        action="store_false",
        default=True,
        help=
        "Do *not* normalize frequency spectrum by number of sites in each locus."
    )
    processing_options.add_argument(
        "--concatenate-loci",
        action="store_true",
        default=False,
        help="Collapse all loci and treat as a single locus for calculation.")
    processing_options.add_argument(
        "--concatenated-locus-label",
        default=None,
        help="If concatenating, label for the concatenated locus.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument(
        '-o',
        '--output-name-prefix',
        action='store',
        dest='output_name_prefix',
        type=str,
        default=None,
        metavar='NAME-PREFIX',
        help=
        "Prefix for output filenames (default: same as configuration filename stem)."
    )
    output_options.add_argument(
        '-O',
        '--output-directory',
        action='store',
        dest='output_directory',
        type=str,
        default=None,
        metavar='DIRECTORY',
        help="Directory for output files (default: current working directory)."
    )
    output_options.add_argument(
        "-l",
        "--labels",
        action="append",
        help=
        "Addition field/value pairs to add to the output (in format <FIELD-NAME>:value;)"
    )
    output_options.add_argument(
        '--field-delimiter',
        type=str,
        default='\t',
        help="Delimiter string separating fields in output (default: <TAB>').")
    output_options.add_argument(
        '--summary-stats-label-prefix',
        type=str,
        default='stat',
        metavar='PREFIX',
        help=
        "Prefix for summar statistic field labels (default: '%(default)s').")
    output_options.add_argument(
        "--append",
        action="store_true",
        default=False,
        help="Append instead of overwriting output file(s).")
    output_options.add_argument("--no-write-header",
                                action="store_true",
                                default=False,
                                help="Do not writer header row.")

    args = parser.parse_args()

    config_d = utility.CaseInsensitiveDict()
    utility.parse_legacy_configuration(filepath=args.configuration_filepath,
                                       config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
        primary_source_filepath=args.configuration_filepath,
        output_name_prefix=args.output_name_prefix,
        output_directory=args.output_directory)
    config_d[
        "is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d[
        "is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(
        args.labels)
    config_d["alignment_directory_head"] = os.path.dirname(
        os.path.abspath(args.configuration_filepath))
    config_d["field_delimiter"] = args.field_delimiter
    config_d["is_concatenate_loci"] = args.concatenate_loci
    config_d["concatenated_locus_label"] = args.concatenated_locus_label
    config_d["is_normalize"] = args.normalize_by_site_counts

    sscalc = sumstats.SisterBayesSummaryStatsCalculator(**config_d)
    filepath = config_d["output_prefix"] + ".obs.sumstats.tsv"
    dest = utility.universal_open(filepath, "a" if args.append else "w")
    if args.append or args.no_write_header:
        is_write_header = False
    else:
        is_write_header = True
    with dest:
        try:
            results = sscalc.write_summary_stats(
                dest=dest, results_store=None, is_write_header=is_write_header)
        except Exception as e:
            sys.stderr.write(
                "Traceback (most recent call last):\n  {}{}\n".format(
                    "  ".join(traceback.format_tb(sys.exc_info()[2])), e))
            sys.exit(1)
Example #10
    def summarize(
        self,
        target_data_filepath,
    ):
        if self.output_name_prefix is None:
            self.output_name_prefix = os.path.splitext(
                os.path.basename(target_data_filepath))[0]
        if self.output_directory is None:
            self.output_directory = "."
        self.output_directory = os.path.realpath(self.output_directory)
        output_prefix = os.path.join(self.output_directory,
                                     self.output_name_prefix)
        with utility.universal_open(target_data_filepath) as src:
            reader = csv.DictReader(src,
                                    delimiter=self.field_delimiter,
                                    quoting=csv.QUOTE_NONE)
            categorical_params = collections.OrderedDict()
            continuous_params = collections.OrderedDict()
            realized_div_time_samples = []
            all_div_times = []
            sp_labels = []
            for row_idx, row in enumerate(reader):
                realized_div_time_samples.append({})
                for key_idx, key in enumerate(reader.fieldnames):
                    if key in categorical_params:
                        categorical_params[key][row[key]] += 1
                    elif key in continuous_params:
                        continuous_params[key].append(float(row[key]))
                    else:
                        if key in ("param.divTimeModel", "param.numDivTimes"):
                            val = row[key]
                            is_categorical = True
                        else:
                            try:
                                val = float(row[key])
                                is_categorical = False
                            except ValueError:
                                val = row[key]
                                is_categorical = True
                        if is_categorical:
                            categorical_params[key] = collections.Counter()
                            categorical_params[key][val] += 1
                        else:
                            continuous_params[key] = [val]
                    if key.startswith("param.divTime."):
                        sp_label = key.replace("param.divTime.", "")
                        realized_div_time_samples[-1][
                            sp_label] = continuous_params[key][-1]
                        # Use the value recorded for this key on this
                        # row (`val` is not rebound for keys already
                        # seen, so it cannot be used here).
                        all_div_times.append(continuous_params[key][-1])
                        if row_idx == 0:
                            sp_labels.append(sp_label)
            ### EXPERIMENTAL ###
            if self.cluster_criteria is not None:
                if self.cluster_criteria == "bin_size":
                    cluster_results = self.cluster_by_bin_size(
                        sp_labels=sp_labels,
                        realized_div_time_samples=realized_div_time_samples,
                        all_div_times=all_div_times,
                        bin_size=self.cluster_criteria_value)
                elif self.cluster_criteria == "num_bins":
                    cluster_results = self.cluster_by_num_bins(
                        sp_labels=sp_labels,
                        realized_div_time_samples=realized_div_time_samples,
                        all_div_times=all_div_times,
                        num_bins=self.cluster_criteria_value)
                elif self.cluster_criteria == "absolute_difference_threshold":
                    cluster_results = self.cluster_by_absolute_difference_threshold(
                        sp_labels=sp_labels,
                        realized_div_time_samples=realized_div_time_samples,
                        absolute_difference_threshold=self.cluster_criteria_value)
                elif self.cluster_criteria == "relative_difference_threshold":
                    cluster_results = self.cluster_by_relative_difference_threshold(
                        sp_labels=sp_labels,
                        realized_div_time_samples=realized_div_time_samples,
                        all_div_times=all_div_times,
                        relative_difference_threshold=self.cluster_criteria_value)
                else:
                    raise ValueError(
                        "Unrecognized cluster criteria: '{}'".format(
                            self.cluster_criteria))
                categorical_params[
                    "param.effectiveDivTimeModel"] = cluster_results
            ### EXPERIMENTAL ###

            with utility.universal_open(
                    output_prefix + ".summary.continuous.tsv", "w") as dest:
                row_results = collections.OrderedDict()
                for param_idx, param_name in enumerate(continuous_params):
                    values = continuous_params[param_name]
                    row_results["param"] = param_name
                    summary = calclib.summarize(values)
                    row_results["mean"] = summary["mean"]
                    row_results["var"] = summary["var"]
                    row_results["sd"] = summary["sd"]
                    row_results["min"] = summary["range"][0]
                    row_results["max"] = summary["range"][1]
                    try:
                        row_results["hpd5"] = summary["hpd95"][0]
                        row_results["hpd95"] = summary["hpd95"][1]
                    except TypeError:
                        row_results["hpd5"] = "NA"
                        row_results["hpd95"] = "NA"
                    try:
                        row_results["quant5"] = summary["quant_5_95"][0]
                        row_results["quant95"] = summary["quant_5_95"][1]
                    except TypeError:
                        row_results["quant5"] = "NA"
                        row_results["quant95"] = "NA"
                    if param_idx == 0:
                        dest.write(
                            self.field_delimiter.join(row_results.keys()) +
                            "\n")
                    dest.write(
                        self.field_delimiter.join(
                            "{}".format(v)
                            for v in row_results.values()) + "\n")
            for param_idx, param_name in enumerate(categorical_params):
                with utility.universal_open(
                        output_prefix + ".summary.{:02d}.{}.tsv".format(
                            param_idx + 1, param_name), "w") as dest:
                    param_counter = categorical_params[param_name]
                    total = float(sum(param_counter.values()))
                    for category_idx, (category_name,
                                       category_count) in enumerate(
                                           param_counter.most_common()):
                        row_results = collections.OrderedDict()
                        row_results["label"] = category_name
                        row_results["freq"] = category_count / total
                        row_results["count"] = category_count
                        if category_idx == 0:
                            dest.write(
                                self.field_delimiter.join(row_results.keys()) +
                                "\n")
                        dest.write(
                            self.field_delimiter.join(
                                "{}".format(v)
                                for v in row_results.values()) + "\n")
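A minimal sketch of the categorical-versus-continuous classification `summarize` applies to each field: values that parse as floats accumulate in lists for continuous summaries, while everything else (including the designated model-label fields) is tallied with a Counter. Field names and values are hypothetical:

import collections

rows = [
    {"param.divTimeModel": "M1", "param.divTime.spA": "0.12"},
    {"param.divTimeModel": "M1", "param.divTime.spA": "0.34"},
    {"param.divTimeModel": "M2", "param.divTime.spA": "0.56"},
]
categorical = collections.defaultdict(collections.Counter)
continuous = collections.defaultdict(list)
for row in rows:
    for key, raw in row.items():
        if key in ("param.divTimeModel", "param.numDivTimes"):
            categorical[key][raw] += 1  # always treated as categorical
            continue
        try:
            continuous[key].append(float(raw))
        except ValueError:
            categorical[key][raw] += 1
print(dict(categorical))  # {'param.divTimeModel': Counter({'M1': 2, 'M2': 1})}
print(dict(continuous))   # {'param.divTime.spA': [0.12, 0.34, 0.56]}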
Example #11
 def read_files(self, filepaths):
     for file_idx, filepath in enumerate(filepaths):
         self.run_logger.info("Reading file {} of {}: '{}'".format(
             file_idx + 1, len(filepaths), filepath))
         with utility.universal_open(filepath) as src:
             self._read_file(src)