def test_ilp_simple():
    """
    Solve a ten-node instance with the ILP method and check the shape of
    the reconstructed network: exactly one root, one target per input node,
    and every target reachable from the root.
    """

    n1 = Node('a', [1, 0, 0, 0, 0])
    n2 = Node('b', [1, 0, 0, 1, 0])
    n3 = Node('c', [1, 0, 0, 2, 0])
    n4 = Node('d', [1, 2, 0, 1, 0])
    n5 = Node('e', [1, 1, 0, 1, 0])
    n6 = Node('f', [1, 0, 3, 2, 0])
    n7 = Node('g', [0, 0, 0, 0, 1])
    n8 = Node('h', [0, 1, 0, 0, 1])
    n9 = Node('i', [0, 1, 2, 0, 1])
    n10 = Node('j', [0, 1, 1, 0, 1])

    nodes = [n1, n2, n3, n4, n5, n6, n7, n8, n9, n10]

    # Divert anything printed during solving into a scratch file. The original
    # stream is restored afterwards so sys.stdout is never left pointing at a
    # closed file, and the scratch file is removed even if solving fails.
    saved_stdout = sys.stdout
    try:
        with open(stdout_backup, "w") as f:
            sys.stdout = f
            tree = ls.solve_lineage_instance(nodes, method="ilp")
    finally:
        sys.stdout = saved_stdout
        if os.path.exists(stdout_backup):
            os.remove(stdout_backup)

    net = tree.get_network()

    roots = [n for n in net if net.in_degree(n) == 0]

    assert len(roots) == 1

    root = roots[0]

    targets = [n for n in net if n.is_target]

    assert len(targets) == len(nodes)

    for t in targets:
        assert nx.has_path(net, root, t)
def test_on_sim_greedy():
    """
    Run the greedy solver on the leaves of the pickled simulated network and
    check the reconstruction: exactly one root, one target per input leaf,
    every target reachable from the root, and no node with more than one
    parent.
    """

    # Close the fixture file deterministically instead of leaking the handle.
    with open("test/data/sim_net.pkl", "rb") as fixture:
        stree = pic.load(fixture)
    leaves = stree.get_leaves()

    # Rebuild fresh Node objects from each leaf's name and character vector.
    target_nodes = [Node(l.name, l.get_character_vec()) for l in leaves]

    rtree = ls.solve_lineage_instance(target_nodes, method="greedy")

    rnet = rtree.get_network()
    roots = [n for n in rnet if rnet.in_degree(n) == 0]

    assert len(roots) == 1

    root = roots[0]

    targets = [n for n in rnet if n.is_target]

    assert len(targets) == len(target_nodes)

    for t in targets:
        assert nx.has_path(rnet, root, t)

    multi_parents = [n for n in rnet if rnet.in_degree(n) > 1]

    assert len(multi_parents) == 0
def test_ilp_parallel_evo():
    """
    Solve a seven-node instance (including '-' entries in some character
    vectors) with the ILP method and check the reconstruction: exactly one
    root, one target per input node, every target reachable from the root,
    and no node with more than one parent.
    """

    n = Node('a', [1, 1, 2, 0])
    n2 = Node('b', [1, 1, 3, 0])
    n3 = Node('c', [2, 1, 1, 0])
    n4 = Node('d', [2, 1, 3, 0])
    n5 = Node('e', [1, 3, 1, '-'])
    n6 = Node('f', [1, '-', '-', '1'])
    n7 = Node('g', [1, 1, 0, 2])

    nodes = [n, n2, n3, n4, n5, n6, n7]

    # Divert anything printed during solving into a scratch file. The original
    # stream is restored afterwards so sys.stdout is never left pointing at a
    # closed file, and the scratch file is removed even if solving fails.
    saved_stdout = sys.stdout
    try:
        with open(stdout_backup, "w") as f:
            sys.stdout = f
            tree = ls.solve_lineage_instance(nodes, method='ilp')
    finally:
        sys.stdout = saved_stdout
        if os.path.exists(stdout_backup):
            os.remove(stdout_backup)

    net = tree.get_network()

    roots = [n for n in net if net.in_degree(n) == 0]

    assert len(roots) == 1

    root = roots[0]

    targets = [n for n in net if n.is_target]

    assert len(targets) == len(nodes)

    for t in targets:
        assert nx.has_path(net, root, t)

    multi_parents = [n for n in net if net.in_degree(n) > 1]

    assert len(multi_parents) == 0
def test_on_sim_hybrid():
    """
    Run the hybrid solver on the leaves of the pickled simulated network and
    check the reconstruction: exactly one root, one target per input leaf,
    every target reachable from the root, and no node with more than one
    parent.
    """

    # Close the fixture file deterministically instead of leaking the handle.
    with open("test/data/sim_net.pkl", "rb") as fixture:
        stree = pic.load(fixture)
    leaves = stree.get_leaves()

    # Rebuild fresh Node objects from each leaf's name and character vector.
    target_nodes = [Node(l.name, l.get_character_vec()) for l in leaves]

    # Divert anything printed during solving into a scratch file. The original
    # stream is restored afterwards so sys.stdout is never left pointing at a
    # closed file, and the scratch file is removed even if solving fails.
    saved_stdout = sys.stdout
    try:
        with open(stdout_backup, "w") as f:
            sys.stdout = f
            rtree = ls.solve_lineage_instance(target_nodes,
                                              method="hybrid",
                                              hybrid_subset_cutoff=200,
                                              time_limit=100,
                                              max_neighborhood_size=500,
                                              threads=4)
    finally:
        sys.stdout = saved_stdout
        if os.path.exists(stdout_backup):
            os.remove(stdout_backup)

    rnet = rtree.get_network()
    roots = [n for n in rnet if rnet.in_degree(n) == 0]

    assert len(roots) == 1

    root = roots[0]

    targets = [n for n in rnet if n.is_target]

    assert len(targets) == len(target_nodes)

    for t in targets:
        assert nx.has_path(rnet, root, t)

    multi_parents = [n for n in rnet if rnet.in_degree(n) > 1]

    assert len(multi_parents) == 0
# Example #5
# 0
def main():
    """
    Command-line driver: reconstruct a lineage tree from the leaves of a
    pickled network and pickle the result.

    The positional ``netfp`` argument is read with ``nx.read_gpickle``. Its
    leaves are de-duplicated by ``char_string`` and handed to exactly one
    algorithm, chosen by the first flag that is set in this order:
    ``--greedy``, ``--hybrid``, ``--ilp``, ``--neighbor-joining``,
    ``--neighbor_joining_weighted``, ``--camin-sokal``.

    The reconstruction is pickled to ``--out_fp``; when that is omitted the
    output name is the input file name with "true" replaced by an algorithm
    tag ("greedy", "hybrid", "ilp", "nj", "nj_weighted" or "cs").

    Raises:
        Exception: if none of the algorithm flags was supplied.

    """
    parser = argparse.ArgumentParser()
    parser.add_argument("netfp", type=str, help="character_matrix")
    parser.add_argument("-nj",
                        "--neighbor-joining",
                        action="store_true",
                        default=False)
    parser.add_argument("--neighbor_joining_weighted",
                        action="store_true",
                        default=False)
    parser.add_argument("--ilp", action="store_true", default=False)
    parser.add_argument("--hybrid", action="store_true", default=False)
    parser.add_argument("--cutoff",
                        type=int,
                        default=80,
                        help="Cutoff for ILP during Hybrid algorithm")
    parser.add_argument(
        "--hybrid_lca_mode",
        action="store_true",
        help=
        "Use LCA distances to transition in hybrid mode, instead of number of cells",
    )
    parser.add_argument("--time_limit",
                        type=int,
                        default=-1,
                        help="Time limit for ILP convergence")
    parser.add_argument(
        "--iter_limit",
        type=int,
        default=-1,
        help="Max number of iterations for ILP solver",
    )
    parser.add_argument("--greedy", "-g", action="store_true", default=False)
    parser.add_argument("--camin-sokal",
                        "-cs",
                        action="store_true",
                        default=False)
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="output verbosity")
    parser.add_argument("--mutation_map", type=str, default="")
    parser.add_argument("--num_threads", type=int, default=1)
    parser.add_argument("--no_triplets", action="store_true", default=False)
    # Parsed as int so a value given on the command line has the same type as
    # the integer default (it was previously parsed as str).
    parser.add_argument("--max_neighborhood_size", type=int, default=3000)
    parser.add_argument("--out_fp",
                        type=str,
                        default=None,
                        help="optional output file")
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Random seed for ILP solver")

    args = parser.parse_args()

    netfp = args.netfp
    outfp = args.out_fp
    verbose = args.verbose

    # --cutoff is interpreted either as an LCA cutoff or as a cell-count
    # cutoff for the hybrid solver, depending on --hybrid_lca_mode.
    lca_mode = args.hybrid_lca_mode
    if lca_mode:
        lca_cutoff = args.cutoff
        cell_cutoff = None
    else:
        cell_cutoff = args.cutoff
        lca_cutoff = None
    time_limit = args.time_limit
    iter_limit = args.iter_limit
    num_threads = args.num_threads
    max_neighborhood_size = args.max_neighborhood_size
    seed = args.seed

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    score_triplets = not args.no_triplets

    prior_probs = None
    if args.mutation_map != "":
        # NOTE: unpickling executes arbitrary code; only pass trusted files.
        with open(args.mutation_map, "rb") as prior_file:
            prior_probs = pic.load(prior_file)

    name = netfp.split("/")[-1]
    stem = ".".join(name.split(".")[:-1])

    # NOTE(review): nx.read_gpickle is not available in NetworkX >= 3.0 —
    # confirm the pinned NetworkX version. It also unpickles, so the input
    # file must be trusted.
    true_network = nx.read_gpickle(netfp)

    if isinstance(true_network, Cassiopeia_Tree):
        true_network = true_network.get_network()

    target_nodes = get_leaves_of_tree(true_network)

    # Keep the first node seen for each distinct character string; a set
    # gives O(1) membership checks while the list preserves input order.
    target_nodes_uniq = []
    seen_charstrings = set()
    for t in target_nodes:
        if t.char_string not in seen_charstrings:
            seen_charstrings.add(t.char_string)
            target_nodes_uniq.append(t)

    if args.greedy:

        if verbose:
            print("Running Greedy Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")

        reconstructed_network_greedy = solve_lineage_instance(
            target_nodes_uniq,
            method="greedy",
            prior_probabilities=prior_probs)

        net = reconstructed_network_greedy[0]

        if outfp is None:
            outfp = name.replace("true", "greedy")
        with open(outfp, "wb") as out:
            pic.dump(net, out)

    elif args.hybrid:

        if verbose:
            print("Running Hybrid Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            # Uses args.cutoff: there is no local named `cutoff`.
            print("Parameters: ILP on sets of " + str(args.cutoff) +
                  " cells " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_hybrid = solve_lineage_instance(
            target_nodes_uniq,
            method="hybrid",
            hybrid_cell_cutoff=cell_cutoff,
            hybrid_lca_cutoff=lca_cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            threads=num_threads,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_hybrid[0]

        if outfp is None:
            outfp = name.replace("true", "hybrid")
        with open(outfp, "wb") as out:
            pic.dump(net, out)

    elif args.ilp:

        if verbose:
            print("Running ILP Algorithm on " +
                  str(len(target_nodes_uniq)) + " Cells")
            print("Parameters: ILP on sets of " + str(args.cutoff) +
                  " cells " + str(time_limit) +
                  "s to complete optimization")

        reconstructed_network_ilp = solve_lineage_instance(
            target_nodes_uniq,
            method="ilp",
            # Previously referenced an undefined `cutoff` (NameError).
            hybrid_subset_cutoff=args.cutoff,
            prior_probabilities=prior_probs,
            time_limit=time_limit,
            max_neighborhood_size=max_neighborhood_size,
            seed=seed,
            num_iter=iter_limit,
        )

        net = reconstructed_network_ilp[0]
        if outfp is None:
            outfp = name.replace("true", "ilp")
        with open(outfp, "wb") as out:
            pic.dump(net, out)

    elif args.neighbor_joining:

        if verbose:
            print("Running Neighbor-Joining on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        infile = "".join(name.split(".")[:-1]) + "infile.txt"
        fn = "".join(name.split(".")[:-1]) + "phylo.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        # Binarize the character matrix with the helper script, writing a
        # relaxed-PHYLIP alignment to `infile`; block until it finishes.
        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(
            script) + " " + fn + " " + infile + " --relaxed"
        subprocess.call(cmd, shell=True)

        aln = AlignIO.read(infile, "phylip-relaxed")

        aln = unique_alignments(aln)

        calculator = DistanceCalculator("identity", skip_letters="?")
        constructor = DistanceTreeConstructor(calculator, "nj")

        tree = constructor.build_tree(aln)

        tree.root_at_midpoint()

        nj_net = Phylo.to_networkx(tree)

        # Replace the Bio.Phylo clades with Node objects; unnamed (internal)
        # clades all receive the placeholder name "state-node".
        rndict = {}
        for n in nj_net:

            if n.name is None:
                rndict[n] = Node("state-node", [])
            else:
                rndict[n] = Node(n.name, [])

        nj_net = nx.relabel_nodes(nj_net, rndict)

        cm = pd.read_csv(fn, sep="\t", index_col=0)

        # Map each row's "|"-joined character string to its row label.
        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        nj_net = fill_in_tree(nj_net, cm)

        nj_net = tree_collapse(nj_net)

        for n in nj_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj")
        with open(outfp, "wb") as out:
            pic.dump(nj_net, out)

        os.system("rm " + infile)
        os.system("rm " + fn)

    elif args.neighbor_joining_weighted:

        if verbose:
            print("Running Neighbor-Joining with Weighted Scoring on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        target_node_charstrings = np.array(
            [t.get_character_vec() for t in target_nodes_uniq])
        dm = compute_distance_mat(target_node_charstrings,
                                  len(target_node_charstrings),
                                  priors=prior_probs)

        ids = [t.name for t in target_nodes_uniq]
        cm_uniq = pd.DataFrame(target_node_charstrings)
        cm_uniq.index = ids
        dm = sp.spatial.distance.squareform(dm)

        dm = DistanceMatrix(dm, ids)

        newick_str = nj(dm, result_constructor=str)

        tree = newick_to_network(newick_str, cm_uniq)

        nj_net = fill_in_tree(tree, cm_uniq)
        nj_net = tree_collapse(nj_net)

        # Map each row's "|"-joined character string to its row label.
        cm_lookup = dict(
            zip(
                list(
                    cm_uniq.apply(
                        lambda x: "|".join([str(k) for k in x.values]),
                        axis=1)),
                cm_uniq.index.values,
            ))

        for n in nj_net:
            n.is_target = n.char_string in cm_lookup

        nj_net = Cassiopeia_Tree("neighbor-joining", network=nj_net)
        if outfp is None:
            outfp = name.replace("true", "nj_weighted")
        with open(outfp, "wb") as out:
            pic.dump(nj_net, out)

    elif args.camin_sokal:

        if verbose:
            print("Running Camin-Sokal Max Parsimony Algorithm on " +
                  str(len(target_nodes_uniq)) + " Unique Cells")

        # Rename every node to its positional index, remembering the original
        # names both in input order and keyed by "s<index>".
        samples_to_cells = {}
        indices = []
        for i, n in enumerate(target_nodes_uniq):
            samples_to_cells["s" + str(i)] = n.name
            indices.append(n.name)
            n.name = str(i)

        infile = "".join(name.split(".")[:-1]) + "_cs_infile.txt"
        fn = "".join(name.split(".")[:-1]) + "_cs_phylo.txt"
        weights_fn = "".join(name.split(".")[:-1]) + "_cs_weights.txt"
        write_leaves_to_charmat(target_nodes_uniq, fn)

        script = SCLT_PATH / "TreeSolver" / "binarize_multistate_charmat.py"
        cmd = "python3.6 " + str(script) + " " + fn + " " + infile
        subprocess.call(cmd, shell=True)

        # Return value is not needed here; weights_fn is passed to mix below.
        construct_weights(infile, weights_fn)

        os.system("touch outfile")
        os.system("touch outtree")

        outfile = stem + "outfile.txt"
        outtree = stem + "outtree.txt"
        # run phylip mix with camin-sokal; the responses file is fed to the
        # program on stdin in place of interactive answers
        responses = "." + stem + ".temp.txt"
        with open(responses, "w") as FH:
            FH.write(infile + "\n")
            FH.write("F\n" + outfile + "\n")
            FH.write("P\n")
            FH.write("W\n")
            FH.write("Y\n")
            FH.write(weights_fn + "\n")
            FH.write("F\n" + outtree + "\n")

        t0 = time.time()
        cmd = "~/software/phylip-3.697/exe/mix"
        cmd += " < " + responses + " > screenout1"
        subprocess.call(cmd, shell=True)

        consense_outtree = stem + "consenseouttree.txt"
        consense_outfile = stem + "consenseoutfile.txt"

        # Reuse the responses file for the consense step.
        with open(responses, "w") as FH:
            FH.write(outtree + "\n")
            FH.write("F\n" + consense_outfile + "\n")
            FH.write("Y\n")
            FH.write("F\n" + consense_outtree + "\n")

        if verbose:
            print("Computing Consensus Tree, elasped time: " +
                  str(time.time() - t0))

        cmd = "~/software/phylip-3.697/exe/consense"
        cmd += " < " + responses + " > screenout"
        subprocess.call(cmd, shell=True)

        # The consensus tree may span several lines; join them into one
        # newick string.
        with open(consense_outtree, "r") as f:
            newick_str = "".join(line.strip() for line in f)

        cm = pd.read_csv(fn, sep="\t", index_col=0, dtype=str)
        cm.index = indices

        cs_net = newick_to_network(newick_str, cm)

        for n in cs_net:
            if n.name in samples_to_cells:
                n.name = samples_to_cells[n.name]

        cs_net = fill_in_tree(cs_net, cm)

        cs_net = tree_collapse2(cs_net)

        # Map each row's "|"-joined character string to its row label.
        cm_lookup = dict(
            zip(
                list(
                    cm.apply(lambda x: "|".join([str(k) for k in x.values]),
                             axis=1)),
                cm.index.values,
            ))

        for n in cs_net:
            if n.char_string in cm_lookup:
                n.is_target = True

        cs_net = Cassiopeia_Tree("camin-sokal", network=cs_net)
        if outfp is None:
            outfp = name.replace("true", "cs")
        with open(outfp, "wb") as out:
            pic.dump(cs_net, out)

        os.system("rm " + outfile)
        os.system("rm " + responses)
        os.system("rm " + outtree)
        os.system("rm " + consense_outfile)
        os.system("rm " + infile)
        os.system("rm " + fn)

    else:

        raise Exception(
            "Please choose an algorithm from the list: greedy, hybrid, ilp, nj, or camin-sokal"
        )