예제 #1
0
def run_pangtree(maf_path: Path, fasta_path: Path, output_dir: Path,
                 po_output: bool) -> None:
    """Build a poagraph from a MAF file and save results for every P value.

    A run directory is requested from ``pathtools.get_child_dir`` under
    ``output_dir`` (named with ``pathtools.get_current_time()``). For each
    value in ``p_values`` (a name defined outside this function) a
    sub-directory is requested, an affinity tree is built, and
    ``affinity_tree.newick`` and ``pangenome.json`` are saved there;
    ``poagraph.po`` is saved as well when ``po_output`` is true.

    Args:
        maf_path: Path of the MAF multialignment file.
        fasta_path: Path handed to ``missings.FromFile`` as the fasta source.
        output_dir: Parent directory of the run directory.
        po_output: Whether the poagraph should also be saved as a PO file.
    """
    run_stamp = pathtools.get_current_time()
    output_dir = pathtools.get_child_dir(output_dir, run_stamp)
    print(f"Runing pangtree for maf: {maf_path} and fasta: {fasta_path} "
          f"Output in: {output_dir}, include po file: {po_output}.")

    fasta_provider = missings.FromFile(fasta_path)
    maf_stream = pathtools.get_file_content_stringio(maf_path)
    maf = msa.Maf(maf_stream, maf_path)
    poagraph, dagmaf = builder.build_from_dagmaf(maf, fasta_provider)

    for p_value in p_values:
        # The directory name is the P value with its dot made path-friendly.
        dir_name = str(p_value).replace(".", "_")
        current_output_dir = pathtools.get_child_dir(output_dir, dir_name)

        stop = at_params.Stop(0.99)
        affinity_tree = at_builders.build_affinity_tree(
            poagraph, None, current_output_dir, stop, at_params.P(p_value),
            True)

        # Affinity tree in Newick format.
        newick_text = affinity_tree.as_newick(None, separate_leaves=True)
        newick_path = pathtools.get_child_path(current_output_dir,
                                               "affinity_tree.newick")
        pathtools.save_to_file(newick_text, newick_path)

        # Optional PO dump of the poagraph.
        if po_output:
            po_text = po.poagraph_to_PangenomePO(poagraph)
            po_path = pathtools.get_child_path(current_output_dir,
                                               "poagraph.po")
            pathtools.save_to_file(po_text, po_path)

        # JSON summary of the whole task.
        task_params = json.TaskParameters(
            multialignment_file_path=str(maf_path),
            multialignment_format="maf",
            datatype="nucleotides",
            blosum_file_path="",
            output_path=current_output_dir,
            fasta_provider=fasta_provider,
            fasta_source_file=fasta_path,
            consensus_type="tree",
            stop=str(stop),
            p=str(p_value),
            output_with_nodes=False)
        pangenome = json.to_PangenomeJSON(task_parameters=task_params,
                                          poagraph=poagraph,
                                          dagmaf=dagmaf,
                                          affinity_tree=affinity_tree)
        json_text = json.to_json(pangenome)
        json_path = pathtools.get_child_path(current_output_dir,
                                             "pangenome.json")
        pathtools.save_to_file(json_text, json_path)
예제 #2
0
    def get_compatibilities(
        self,
        sequences_ids: List[msa.SequenceID],
        consensus_path: SeqPath,
        p: Optional[at_params.P] = at_params.P(1)
    ) -> Dict[msa.SequenceID, Compatibility]:
        """Calculate compatibilities of sequences listed in sequences_ids to given consensus_path. Use P.

        For every sequence the number of distinct nodes it shares with
        consensus_path is divided by the sequence's node count (all of its
        paths concatenated); that ratio and p are passed to Compatibility.

        Args:
            sequences_ids: Compatibilities of these sequences will be calculated.
            consensus_path: Sequences will be compared to this consensus path.
            p: Parameter P, see affinity tree algorithm parameters. None is
                treated like the default P(1).

        Returns:
            Dictionary of sequences_ids and corresponding compatibility to given consensus_path.

        Raises:
            KeyError: If there is no sequence with given ID.
            ZeroDivisionError: If a listed sequence contains no nodes.
        """

        if p is None:
            p = at_params.P(1)

        # The consensus node set is identical for every sequence: build it
        # once instead of once per iteration.
        consensus_nodes = set(consensus_path)

        compatibilities = {}
        for seq_id in sequences_ids:
            try:
                sequence_paths = self.sequences[seq_id].paths
            except KeyError as err:
                # KeyError is what the contract above documents; it is still
                # caught by callers that handle plain Exception.
                raise KeyError(
                    "No sequence with given ID in poagraph.") from err
            if len(sequence_paths) == 1:
                sequence_path = sequence_paths[0]
            else:
                sequence_path = [
                    node_id for path in sequence_paths for node_id in path
                ]
            shared_nodes = consensus_nodes.intersection(sequence_path)
            compatibilities[seq_id] = Compatibility(
                len(shared_nodes) / len(sequence_path), p)
        return compatibilities
예제 #3
0
def get_parser() -> argparse.ArgumentParser:
    """Create ArgumentParser for pang module.

    Returns:
        Parser for the ``pangtreebuild`` program with all command line
        options registered; ``--multialignment`` is the only required one.
    """

    p = argparse.ArgumentParser(prog='pangtreebuild',
                                description="""This software builds poagraph
                                                and generates affinitytree.""",
                                epilog="""For more information check
                                          github.com/meoke/pangtree""")
    p.add_argument('--output_dir',
                   type=_cli_dir_arg,
                   default=get_default_output_dir(),
                   help='Output directory path.')
    p.add_argument('--multialignment',
                   metavar='MULTIALIGNMENT_PATH',
                   type=_mulitalignment_file,
                   required=True,
                   help='Path to the multialignment file.')
    p.add_argument('--datatype',
                   type=_data_type,
                   default=graph.DataType.Nucleotides,
                   help='\'n\' for nucleotides, \'p\' for proteins. ' +
                   inspect.getdoc(graph.DataType))
    p.add_argument('--metadata',
                   metavar='METADATA_PATH',
                   type=_metadata_file,
                   help='Path to the csv file with metadata. ' +
                   inspect.getdoc(msa.MetadataCSV))
    p.add_argument('--raw_maf',
                   action='store_true',
                   default=False,
                   help="""Poagraph building from maf file parameter. Set if
                           the maf content must not be transformed to DAG
                           before building graph. Poagraph that was build
                           in this way provides affinitytree tree but the
                           affinitytree do not reflect the real life
                           sequences.""")
    p.add_argument('--fasta_provider',
                   metavar="FASTA_PROVIDER",
                   choices=['ncbi', 'file'],
                   help="""Maf file may not include full _sequences.
                            In such case an additional data source is needed.
                            Use \'ncbi\' for NCBI (activates CACHE option)
                            or \'file\' for file (then provide also
                            FASTA_PATH). MISSING_SYMBOL is used if this
                            argument is omitted.""")
    p.add_argument('--missing_symbol',
                   metavar='MISSING_SYMBOL',
                   type=_cli_arg(missings.MissingBase),
                   help=inspect.getdoc(missings.MissingBase))
    p.add_argument('--cache',
                   action='store_true',
                   help="""Set if fastas downloaded from NCBI should be cached
                           locally in .fastacache folder. Used if Fasta
                           Provider is NCBI. Sequences downloaded from NCBI
                           are stored and reused by this program.""")
    p.add_argument('--fasta_path',
                   metavar="FASTA_PATH",
                   type=_path_if_valid,
                   help="""ZIP archive with fasta files or fasta file used
                        as FASTA_PROVIDER.""")
    p.add_argument('--affinity',
                   choices=['poa', 'tree'],
                   help="""Generate affinity tree. Use \'poa\' for direct
                           result of poa software, \'tree\' for Affinity
                           Tree algorithm.""")
    p.add_argument('--blosum',
                   type=_blosum_file,
                   metavar='BLOSUM_PATH',
                   help='Path to the blosum file. ' +
                   inspect.getdoc(at_params.Blosum))
    p.add_argument('--hbmin',
                   type=_cli_arg(at_params.Hbmin),
                   default=at_params.Hbmin(),
                   help='Simple POA algorithm parameter. '
                   'Hbmin value. ' + inspect.getdoc(at_params.Hbmin))
    # The trailing space keeps the fixed prefix from running straight into
    # the appended class docstring (same layout as --blosum and --hbmin).
    p.add_argument('--stop',
                   type=_cli_arg(at_params.Stop),
                   default=at_params.Stop(),
                   help='Tree POA algorithm parameter. ' +
                   inspect.getdoc(at_params.Stop))
    p.add_argument('--p',
                   type=_cli_arg(at_params.P),
                   default=at_params.P(),
                   help='Tree consensus algorithm parameter. ' +
                   inspect.getdoc(at_params.P))
    p.add_argument('--output_fasta',
                   action='store_true',
                   help="""Set if fasta files for _sequences and
                            affinitytree must be produced.""")
    p.add_argument('--output_po',
                   action='store_true',
                   default=False,
                   help='Set if po file for poagraph must be produced.')
    p.add_argument('--output_full',
                   action='store_true',
                   default=False,
                   help='Set if the result pangenome.json should contain '
                   'list of nodes ids for sequences and consensuses')
    p.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   default=False,
                   help='Set if detailed log files must be produced.')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   default=False,
                   help='Set to turn off console logging.')
    return p
예제 #4
0
 def __sub__(
         self, other: Union["Compatibility",
                            at_params.Stop]) -> "Compatibility":
     """Return ``self - other`` as a new Compatibility.

     ``other`` is first handed to ``_check_p_equality``; the result is then
     constructed from ``self.value - other.value`` and a P made of ``self.p``.
     """
     self._check_p_equality(other)
     difference = self.value - other.value
     # NOTE(review): if the Compatibility constructor raises its first
     # argument to the power of p, the difference is exponentiated here
     # as well -- confirm that this is intended.
     return Compatibility(difference, at_params.P(self.p))
예제 #5
0
 def __init__(self, compatibility: float, p: at_params.P = at_params.P(1)):
     """Store ``compatibility`` raised to the power ``p.value``.

     Args:
         compatibility: Base compatibility value.
         p: Parameter whose ``value`` is used as the exponent and kept in
             ``self.p``; defaults to ``at_params.P(1)``.
     """
     exponent = p.value
     self.value: float = compatibility ** exponent
     self.p: float = exponent
예제 #6
0
class AffinityTreeGenerationTests(unittest.TestCase):
    """Tests of the cutoff returned by ``at_builders._find_node_cutoff``."""

    # NOTE(review): @data/@unpack look like ddt decorators, which only take
    # effect when the class itself is decorated with @ddt; no class decorator
    # is visible directly above this class -- confirm it is present.

    @data((at_params.P(0.5), graph.Compatibility(0.836660026534076)),
          (at_params.P(1), graph.Compatibility(0.7)),
          (at_params.P(4), graph.Compatibility(0.6561)))
    @unpack
    def test_1_p_parameter_influence(self, p: at_params.P,
                                     expected_cutoff: graph.Compatibility):
        """Cutoff found for compatibilities calculated with a given P."""
        nodes = [
            graph.Node(node_id=nid(0), base=b('T'), aligned_to=None),
            graph.Node(node_id=nid(1), base=b('A'), aligned_to=None),
            graph.Node(node_id=nid(2), base=b('G'), aligned_to=None),
            graph.Node(node_id=nid(3), base=b('A'), aligned_to=None),
            graph.Node(node_id=nid(4), base=b('C'), aligned_to=None),
            graph.Node(node_id=nid(5), base=b('A'), aligned_to=None),
            graph.Node(node_id=nid(6), base=b('C'), aligned_to=None),
            graph.Node(node_id=nid(7), base=b('G'), aligned_to=None),
            graph.Node(node_id=nid(8), base=b('T'), aligned_to=None),
            graph.Node(node_id=nid(9), base=b('A'), aligned_to=None)
        ]

        # Each following sequence shares fewer nodes with the consensus
        # path built below (nodes 10..19).
        sequences = {
            msa.SequenceID('seq0'):
            graph.Sequence(msa.SequenceID('seq0'), [
                graph.SeqPath(
                    [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 9])])
            ], graph.SequenceMetadata({})),
            msa.SequenceID('seq1'):
            graph.Sequence(msa.SequenceID('seq1'), [
                graph.SeqPath(
                    [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 8, 9])])
            ], graph.SequenceMetadata({})),
            msa.SequenceID('seq2'):
            graph.Sequence(msa.SequenceID('seq2'), [
                graph.SeqPath(
                    [*map(nid, [10, 11, 12, 13, 14, 15, 16, 7, 8, 9])])
            ], graph.SequenceMetadata({})),
            msa.SequenceID('seq3'):
            graph.Sequence(msa.SequenceID('seq3'), [
                graph.SeqPath([*map(nid, [10, 11, 12, 3, 4, 5, 6, 7, 8, 9])])
            ], graph.SequenceMetadata({})),
            # The sequence's own ID matches its dictionary key, like every
            # other entry.
            msa.SequenceID('seq4'):
            graph.Sequence(
                msa.SequenceID('seq4'),
                [graph.SeqPath([*map(nid, [10, 11, 2, 3, 4, 5, 6, 7, 8, 9])])],
                graph.SequenceMetadata({}))
        }

        poagraph = graph.Poagraph(nodes, sequences)

        consensus_path = graph.SeqPath(
            [*map(nid, [10, 11, 12, 13, 14, 15, 16, 17, 18, 19])])
        compatibilities = poagraph.get_compatibilities(
            poagraph.get_sequences_ids(), consensus_path, p)

        actual_cutoff = at_builders._find_node_cutoff(
            list(compatibilities.values()), []).cutoff
        self.assertAlmostEqual(expected_cutoff.value, actual_cutoff.value)

    @data(
        # single compatibility value
        (0.5, [graph.Compatibility(0.5)]),

        # two compatibilities values
        (0.7, [graph.Compatibility(0.5),
               graph.Compatibility(0.7)]),
        (1, [graph.Compatibility(1),
             graph.Compatibility(0.45)]),
        (0.9, [graph.Compatibility(0.9),
               graph.Compatibility(0.5)]),

        # repeated values
        (0.7, [*map(graph.Compatibility, [0.5, 0.7, 0.7])]),
        (0.9, [*map(graph.Compatibility, [0.9, 0.5, 0.5])]),
        (1, [*map(graph.Compatibility, [0.45, 1, 0.45, 0.45])]),

        # many unique compatibilities values
        (.8, [*map(graph.Compatibility, [.3, .4, .8])]),
        (0.91,
         [*map(graph.Compatibility, [0.31, 0.32, 0.91, 0.92, 0.93, 0.97])]),
        (0.91, [
            *map(graph.Compatibility,
                 [0.29, 0.3, 0.33, 0.91, 0.92, 0.93, 0.97])
        ]),
        (1, [*map(graph.Compatibility, [0.81, 0.75, 0.8, 0.81, 1])]),
        (0.9, [*map(graph.Compatibility, [0.5, 0.9, 0.99])]),
        (0.7, [*map(graph.Compatibility, [0.2, 0.85, 0.7, 0.8])]),
        (0.99, [*map(graph.Compatibility, [0.99, 0.9, 0.99])]),
        (0.99, [*map(graph.Compatibility, [0.99])]),

        # repeated distance between values
        (.4, [*map(graph.Compatibility, [.3, .4, .5])]),

        # all the same values
        (.1, [*map(graph.Compatibility, [.1, .1, .1])]))
    @unpack
    def test_2_find_cutoff_no_so_far_values(
            self, expected_cutoff: float,
            compatibilities: List[graph.Compatibility]):
        """Cutoff for the given compatibilities when no cutoffs exist yet."""
        actual_cutoff = at_builders._find_node_cutoff(compatibilities,
                                                      []).cutoff
        self.assertEqual(expected_cutoff, actual_cutoff.value)

    def test_3_find_cutoff_no_compatibilities(self):
        """An empty compatibilities list is rejected with ValueError."""
        with self.assertRaises(ValueError) as err:
            at_builders._find_node_cutoff([], []).cutoff
        # The message is checked after the ``with`` block: a statement placed
        # behind the raising call inside the block would never run.
        # Whitespace is collapsed so line wrapping in the message is ignored.
        self.assertEqual(" ".join(str(err.exception).split()),
                         "Empty compatibilities list. Cannot find cutoff.")

    @data(
        # guard <= all compatibilities
        (0.2, [0.2, 0.7, 0.8, 0.85], [0.1, 0.01, 0]),
        (0.7, [0.7, 0.85, 0.7, 0.8], [0.1, 0.01, 0]),
        (0.8, [0.7, 0.7, 0.85, 0.8], [0.85, 0.91, 1.0]),

        # guard > all compatibilities
        (0.6, [0.3, 0.6, 0.61, 0.61], [0.99]),  # big distance to guard
        (0.9, [0.2, 0.97, 0.98, 0.9], [0.99]),  # small distance to guard

        # guard between compatibilities
        (0.5, [0.2, 0.57, 0.58, 0.5], [0.55]),  # take smaller than guard
        (0.58, [0.2, 0.27, 0.58, 0.2], [0.55]),  # take greater than guard
        (0.55, [0.2, 0.58, 0.27, 0.55], [0.55])  # take equal to guard
    )
    @unpack
    def test_4_find_cutoff_with_so_far_values(self, expected_cutoff,
                                              compatibilities, so_far_cutoffs):
        """Cutoff for the given compatibilities and previous cutoffs."""
        compatibilities = [graph.Compatibility(c) for c in compatibilities]
        so_far_cutoffs = [graph.Compatibility(c) for c in so_far_cutoffs]
        actual_cutoff = at_builders._find_node_cutoff(compatibilities,
                                                      so_far_cutoffs).cutoff
        self.assertEqual(expected_cutoff, actual_cutoff.value)