示例#1
0
文件: input.py 项目: aganezov/gos-asm
    def run(self, manager):
        """Read every "gos-asm" input and publish it under ``manager.data["gos-asm"]``.

        The following keys are written: ``"bg"`` (breakpoint graph merged from all
        block-order files), ``"phylogenetic_tree"``, ``"target_multicolor"``,
        ``"target_multicolors"`` and ``"repeats_guidance"``.

        :param manager: object exposing ``logger``, ``configuration`` and ``data``.
        """

        def representation_size(multicolor):
            # Sort key: multicolors with a longer hashable representation come first.
            return len(multicolor.hashable_representation)

        manager.logger.info("Reading blocks orders data")
        input_config = manager.configuration["gos-asm"]["input"]
        block_orders_paths = input_config["block_orders_file_paths"]
        breakpoint_graph = BreakpointGraph()
        for block_orders_path in block_orders_paths:
            with open(block_orders_path, "rt") as source:
                partial_graph = GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False)
                breakpoint_graph.update(breakpoint_graph=partial_graph, merge_edges=False)
        asm_data = manager.data["gos-asm"]
        asm_data["bg"] = breakpoint_graph

        manager.logger.info("Reading phylogenetic tree information")
        tree = NewickReader.from_string(data_string=input_config["phylogenetic_tree"])
        asm_data["phylogenetic_tree"] = tree

        target_genomes = [BGGenome(organism_name) for organism_name in input_config["target_organisms"]]
        full_tmc = Multicolor(*target_genomes)
        asm_data["target_multicolor"] = full_tmc
        consistent_targets = Multicolor.split_colors(full_tmc,
                                                     guidance=tree.consistent_multicolors,
                                                     account_for_color_multiplicity_in_guidance=False)

        # Extend the split with (copies of) every non-empty tree multicolor that is
        # contained in one of the initially obtained target multicolors.
        for target_mc in list(consistent_targets):
            for tree_mc in deepcopy(tree.consistent_multicolors):
                if not tree_mc <= target_mc:
                    continue
                if tree_mc in consistent_targets:
                    continue
                if len(tree_mc.colors) > 0:
                    consistent_targets.append(tree_mc)

        consistent_targets = sorted(consistent_targets, key=representation_size, reverse=True)

        # Add the sum of every group (size >= 2) of pairwise non-intersecting multicolors.
        all_target_multicolors = list(consistent_targets)
        for group_size in range(2, len(consistent_targets) + 1):
            for group in itertools.combinations(consistent_targets, group_size):
                has_overlap = any(len(first.intersect(second).colors) > 0
                                  for first, second in itertools.combinations(group, 2))
                if has_overlap:
                    continue
                merged = Multicolor()
                for member in group:
                    merged += member
                all_target_multicolors.append(merged)

        # De-duplicate through the hashable representation, then order again.
        unique_representations = {mc.hashable_representation for mc in all_target_multicolors}
        all_target_multicolors = sorted([Multicolor(*representation) for representation in unique_representations],
                                        key=representation_size,
                                        reverse=True)
        asm_data["target_multicolors"] = all_target_multicolors

        manager.logger.info("Reading repeats-bridges information")
        asm_data["repeats_guidance"] = get_repeats_bridges_guidance(
            file_name=input_config["repeats_bridges_file"], data=manager.data)
示例#2
0
 def test_diamond():
     """Expect exactly one diamond pattern in a 4-vertex cycle with one color per edge."""
     multigraph = MultiGraph()
     multigraph.add_nodes_from(range(4))
     breakpoint_graph = BreakpointGraph(multigraph)
     first_color = ['A', 'C']
     second_color = ['B', 'D']
     topology = (first_color, second_color)
     # NOTE(review): A, B, C and D are not defined inside this function; presumably
     # they are module-level names (or were meant to be the strings above) - confirm.
     colored_edges = [((0, 1), A), ((2, 3), B), ((1, 2), C), ((0, 3), D)]
     for (vertex1, vertex2), color in colored_edges:
         breakpoint_graph.add_edge(vertex1, vertex2, Multicolor(color))
     diamonds = find_diamond_patterns(breakpoint_graph)
     assert len(diamonds) == 1
示例#3
0
 def test_paths():
     """Expect alternating-structure size 3 for a 4-vertex cycle colored by two alternating multicolors."""
     multigraph = MultiGraph()
     multigraph.add_nodes_from(range(4))
     breakpoint_graph = BreakpointGraph(multigraph)
     first_color = ['A', 'C']
     second_color = ['B', 'D']
     topology = (first_color, second_color)
     # Edges (0, 1) and (2, 3) carry the first color set, (1, 2) and (0, 3) the second.
     colored_edges = [((0, 1), first_color), ((2, 3), first_color),
                      ((1, 2), second_color), ((0, 3), second_color)]
     for (vertex1, vertex2), colors in colored_edges:
         breakpoint_graph.add_edge(vertex1, vertex2, Multicolor(*colors))
     structures_size = get_size_of_alternating_structures(breakpoint_graph, topology)
     assert structures_size == 3
示例#4
0
 def test_cylinder():
     """Check cylinder detection on two 4-vertex cycles that differ in one edge color.

     Both cycles have two 'A'+'B' edges and a 'C' edge; the cycle whose last edge is
     'D' must yield one cylinder pattern, the one whose last edge is 'C' must yield none.
     """

     def build_cycle(last_edge_color):
         # Builds a fresh graph so that the two scenarios do not share any state.
         multigraph = MultiGraph()
         multigraph.add_nodes_from(range(4))
         graph = BreakpointGraph(multigraph)
         double_color = ['A', 'B']
         graph.add_edge(0, 1, Multicolor(*double_color))
         graph.add_edge(2, 3, Multicolor(*double_color))
         graph.add_edge(1, 2, Multicolor('C'))
         graph.add_edge(0, 3, Multicolor(last_edge_color))
         return graph

     breakpoint_graph1 = build_cycle('D')
     assert len(find_cylinder_patterns(breakpoint_graph1)) == 1
     breakpoint_graph2 = build_cycle('C')
     assert len(find_cylinder_patterns(breakpoint_graph2)) == 0
示例#5
0
    def run(self, manager):
        """Run the external MGRA tool and store the guidance graph it produces.

        When no MGRA executable path is configured the task is skipped. Otherwise the
        current breakpoint graph is written as GRIMM block orders next to an MGRA
        configuration file inside ``<output dir>/tmp_mgra``, MGRA is invoked on them and
        every ``*.gen`` file from its ``genomes`` output directory is merged into a
        breakpoint graph saved as ``manager.data["mgra"]["guidance_graph"]``.

        The exit status of MGRA is logged; a non-zero status is reported at error level
        and reading of the output directory is still attempted afterwards.

        :param manager: object exposing ``logger``, ``configuration`` and ``data``.
        """
        mgra_ex_path = get_from_dict_with_path(manager.configuration, key="executable_path", path=["mgra"])
        manager.logger.info("=" * 80)
        if mgra_ex_path is None:
            manager.logger.info("MGRA executable path is not supplied, skipping the MGRA based tasks")
            return
        manager.logger.info("Preparing data to communicate with MGRA and ontain guidance graph")
        temp_dir = os.path.join(manager.configuration["gos-asm"]["output"]["dir"], "tmp_mgra")
        # exist_ok avoids the check-then-create race and also creates a missing output directory.
        os.makedirs(temp_dir, exist_ok=True)
        blocks_file_name = os.path.join(temp_dir, "blocks.txt")
        config_file_name = os.path.join(temp_dir, "config.cfg")
        mgra_output_dir_name = os.path.join(temp_dir, "output/")

        manager.logger.debug("Writing blocks orders in GRIMM format to {file_name}".format(file_name=blocks_file_name))
        GRIMMWriter.print_genomes_as_grimm_blocks_orders(bg=manager.data["gos-asm"]["bg"],
                                                         file_name=blocks_file_name)

        manager.logger.debug("Writing configuration file for MGRA run to {file_name}".format(file_name=config_file_name))
        config = self.create_mgra_config(blocks_file_name=blocks_file_name, manager=manager)
        with open(config_file_name, "wt") as destination:
            json.dump(obj=config, fp=destination)
        manager.logger.info("Running MGRA on prepared configuration")
        # NOTE(review): the command is assembled as a shell string, so paths containing
        # spaces or shell metacharacters are not supported.
        exit_status = os.system("{mgra_ex_path} -c {config_file_path} -o {output_dir_path}"
                                "".format(mgra_ex_path=mgra_ex_path,
                                          config_file_path=config_file_name,
                                          output_dir_path=mgra_output_dir_name))
        if exit_status == 0:
            manager.logger.debug("MGRA has successfully finished")
        else:
            # Do not claim success when the external tool reported a failure.
            manager.logger.error("MGRA finished with a non-zero exit status {exit_status}"
                                 "".format(exit_status=exit_status))
        manager.logger.info("Reading MGRA produced guidance graph")

        genomes_dir = os.path.join(mgra_output_dir_name, "genomes")
        genome_files = [name for name in os.listdir(genomes_dir) if name.endswith(".gen")]
        full_genomes_paths = [os.path.join(genomes_dir, name) for name in genome_files]
        guidance_bg = BreakpointGraph()
        for file_name in full_genomes_paths:
            with open(file_name, "rt") as source:
                guidance_bg.update(breakpoint_graph=GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False), merge_edges=False)
        if "mgra" not in manager.data:
            manager.data["mgra"] = {}
        manager.data["mgra"]["guidance_graph"] = guidance_bg
        manager.logger.info("Obtained MGRA produced guidance graph")
TARGET_ORGANISM_NAMES = ["human", "chimp", "rat"]
COMPLETE_ORGANISM_NAMES = ["dog", "opossum", "cat", "mouse"]

################################################################################################################
#
# END OF experiment set up (data)
#
################################################################################################################

if __name__ == "__main__":
    # Command line arguments: <length> <identity>; together they name the data sub-directory.
    length, identity = sys.argv[1], sys.argv[2]
    for organism_name in TARGET_ORGANISM_NAMES:
        GRIMM_FILES.append(PATH + length + "_" + identity + "/" + organism_name + ".txt")

    print("Reading data into breakpoint graph...")
    graph = BreakpointGraph()
    for grimm_file_path in GRIMM_FILES:
        with open(grimm_file_path, "rt") as source:
            genome_graph = GRIMMReader.get_breakpoint_graph(source)
            graph.update(genome_graph, merge_edges=True)

    print("Getting a tree...")
    bgtree = NewickReader.from_string(NEWICK_STRING_TREE)

    print("Preparing organisms for assembling...")
    target_organisms = [BGGenome(name) for name in TARGET_ORGANISM_NAMES]
    exclude = [BGGenome(name) for name in COMPLETE_ORGANISM_NAMES]

    print("Staring the assembly process...")
    result = assemble_scaffolds(graph=graph,
                                bgtree=bgtree,
                                target_organisms=target_organisms,
                                exclude=exclude,
                                verbose=True)
    print("Finished assembling!")
                # print("\t", " ".join(
                #     strand + str(gene_name) for strand, gene_name in grimm_formatted_genomes[genome][scaffold_name]),
                #       "$")
                print(" ".join(strand + str(gene_name) for strand, gene_name in
                               grimm_formatted_genomes[genome][scaffold_name]),
                      "$",
                      file=target)
        # print()

    bg_graphs = dict()
    for file_name in os.listdir(target_directory):
        file_name = os.path.join(target_directory, file_name)
        with open(file_name, "r") as source:
            bg = GRIMMReader.get_breakpoint_graph(source)
            bg_graphs[file_name.split(".")[0]] = bg
    bg = BreakpointGraph()
    for br_gr in bg_graphs.values():
        bg.update(br_gr, merge_edges=True)

    target_multicolor = Multicolor("Anguilla_japonica")

    print("Breakpoint graph stats:")
    print(
        "\t", "non-infinity nodes count:",
        len(
            list(node for node in bg.nodes()
                 if not BGVertex.is_infinity_vertex(node))))
    normal_edges, infinity_edges = [], []
    for edge in bg.edges():
        if edge.is_infinity_edge:
            infinity_edges.append(edge)
示例#8
0
def _read_genome_fragments_orders(file_path):
    """Group the meaningful lines of a fragments-order file by genome.

    Blank lines and lines starting with ``#`` are ignored. A line starting with ``>``
    selects the current genome (the text after ``>``) and is itself stored; every other
    line is stored for the current genome, or dropped when no genome was selected yet.

    :param file_path: path of the text file to read.
    :return: ``defaultdict(list)`` mapping ``BGGenome`` to its stripped lines.
    """
    orders = defaultdict(list)
    current_genome = None
    with open(file_path, "rt") as source:
        for raw_line in source:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith(">"):
                current_genome = BGGenome(line[1:])
                orders[current_genome].append(line)
            elif current_genome is not None:
                orders[current_genome].append(line)
    return orders


def _build_reference_assembly_graphs(reference_orders):
    """Build, per reference genome, a graph of fragment adjacencies.

    Consecutive fragments of a chromosome are connected with edges of type ``"HC"``
    (plus the last-to-first edge when the chromosome type token is ``"@"``); pairs
    returned by ``get_closure_fragments`` that are not connected yet get type ``"GOC"``.

    :param reference_orders: mapping genome -> stripped lines (header lines included),
        as produced by ``_read_genome_fragments_orders``.
    :return: tuple ``(graphs, fragmentation_totals)``, both keyed by genome;
        the total accumulates ``len(fragments) - 1`` over the genome's chromosomes.
    """
    graphs = {}
    fragmentation_totals = {}
    for genome, lines in reference_orders.items():
        for line in lines:
            if line.startswith(">"):
                # Only genomes with an actual header line get a graph.
                if genome not in graphs:
                    graphs[genome] = Graph()
                    fragmentation_totals[genome] = 0
                continue
            # The last token of a chromosome line is its type marker.
            *fragments, chr_type = line.split()
            fragmentation_totals[genome] += len(fragments) - 1
            graph = graphs[genome]
            for fragment1, fragment2 in zip(fragments[:-1], fragments[1:]):
                v1, v2 = get_pair_of_fragments_vertices(fragment1, fragment2)
                graph.add_edge(v1, v2, attr_dict={"type": "HC"})
            if chr_type == "@":
                v1, v2 = get_pair_of_fragments_vertices(fragments[-1], fragments[0])
                graph.add_edge(v1, v2, attr_dict={"type": "HC"})
            for cnt, fragment in enumerate(fragments):
                for closure_fragment in get_closure_fragments(fragments, cnt, chr_type):
                    v1, v2 = get_pair_of_fragments_vertices(fragment, closure_fragment)
                    if not graph.has_edge(v1, v2):
                        graph.add_edge(v1, v2, attr_dict={"type": "GOC"})
    return graphs, fragmentation_totals


def _percentage(value, total):
    """Return ``value`` as a percentage of ``total``; ``0.0`` when ``total`` is zero."""
    return value * 100 / total if total else 0.0


def _append_assembly_points_summary(result, ap_evaluations):
    """Append the ``<ul>`` with assembly point counts to ``result``.

    :return: tuple ``(total_cnt, hc_cnt, goc_cnt, incorrect_cnt)``.
    """
    total_cnt = len(ap_evaluations)
    hc_cnt = sum(1 for ap in ap_evaluations if ap.HC)
    goc_cnt = sum(1 for ap in ap_evaluations if ap.GOC and not ap.HC)
    incorrect_cnt = sum(1 for ap in ap_evaluations if not ap.HC and not ap.GOC)
    result.append("<ul>")
    result.append("<li><p>Total # of identified assembly points: <b>{ap_cnt}</b></p></li>".format(ap_cnt=total_cnt))
    result.append("<li><p>Correct assembly points: <b>{HC_cnt}</b></p></li>".format(HC_cnt=hc_cnt))
    result.append(
        "<li><p>Correct from Global Gene Order (GOC) perspective assembly points: <b>{GOC_cnt}</b></p></li>".format(GOC_cnt=goc_cnt))
    result.append("<li><p>Incorrect assembly points: <b>{ic_cnt}</b></p></li>".format(ic_cnt=incorrect_cnt))
    result.append("</ul>")
    return total_cnt, hc_cnt, goc_cnt, incorrect_cnt


def _append_progress_bars(result, counts, total):
    """Append one progress-bar ``<div>`` per count (success / info / danger) to ``result``.

    The enclosing ``<div class='progress'>`` is opened and closed by the caller.
    """
    for value, style in zip(counts, ["success", "info", "danger"]):
        value_prs = _percentage(value, total)
        result.append(
            '<div class="progress-bar progress-bar-{style}" role="progressbar" aria-valuenow="{value}" aria-valuemin="0" aria-valuemax="100" style="width: {value}%">'
            ''.format(style=style, value=value_prs))
        result.append("<span>{value:.2f}%</span>".format(value=value_prs))
        result.append("</div>")


def get_html_report_experiment_entry(experiment):
    """Render the HTML report section of a single experiment.

    The experiment configuration module (``experiment.config_file_path``) is imported
    afresh and merged over the default configuration. Assembly points produced by the
    run are classified against the reference fragment order
    (``<experiment.evaluation_dir_path>/chr_fragments_order.txt``) as correct (``HC``),
    correct from the global-order perspective (``GOC``) or incorrect, and per-genome
    plus overall statistics are rendered. Percentages whose denominator is zero are
    reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    :param experiment: object exposing ``config_file_path`` and ``evaluation_dir_path``.
    :return: the HTML fragment as a single newline-joined string.
    """
    module_path, file_name = os.path.split(experiment.config_file_path)
    if module_path not in sys.path:
        sys.path.insert(0, module_path)
    # splitext (unlike slicing at rfind(".")) keeps a name without extension intact.
    module_name = os.path.splitext(file_name)[0]
    if module_name in sys.modules:
        # Force a fresh import so a changed configuration file is picked up.
        del sys.modules[module_name]
    module = importlib.import_module(module_name)
    config = recursive_dict_update(default_configuration.configuration, module.configuration)

    result = [
        "<hr>",
        "<hr>",
        "<h1>{experiment_name}</h1>".format(experiment_name=config.get("experiment_name", "Experiment XXX")),
    ]

    output_config = config["gos-asm"]["output"]
    reference_chr_fragments_order_full_path = os.path.join(experiment.evaluation_dir_path, "chr_fragments_order.txt")
    assembled_chains_full_path = os.path.join(output_config["dir"], output_config["chains_file"])
    assembly_points_file_full_path = os.path.join(output_config["dir"], output_config["assembly_points_file"])

    # Assembly points are stored as a "|"-delimited table with a header row.
    assembly_point_evaluation = []
    with open(assembly_points_file_full_path, newline="") as csv_file:
        reader = csv.reader(csv_file, delimiter="|")
        headers = [header.strip() for header in next(reader)]
        for row in reader:
            entry = dict(zip(headers, row))
            ap = AssemblyPoint.from_assembly_points_file(separated_values=entry)
            assembly_point_evaluation.append(AssemblyPointEvaluation(ap=ap))

    genomes_ref_chr_fragments_orders = _read_genome_fragments_orders(reference_chr_fragments_order_full_path)
    assembled_genome_fragments_orders = _read_genome_fragments_orders(assembled_chains_full_path)

    target_genomes = [BGGenome(genome_name) for genome_name in config["gos-asm"]["input"]["target_organisms"]]

    # SCJ distance between the assembled and the reference version of every target genome.
    scjs = {}
    for genome in target_genomes:
        genome_lines = assembled_genome_fragments_orders[genome] + genomes_ref_chr_fragments_orders[genome]
        genome_bg = GRIMMReader.get_breakpoint_graph(genome_lines, merge_edges=False)
        scjs[genome] = single_cut_and_join_distance(genome_bg)

    reference_assembly_graph, genomes_fragmentation_total = _build_reference_assembly_graphs(
        genomes_ref_chr_fragments_orders)

    # Classify every assembly point by the type of the matching reference edge, if any.
    for ap_eval in assembly_point_evaluation:
        ap = ap_eval.ap
        fr1_suffix = "h" if ap.fragment1_sign == "+" else "t"
        fr2_suffix = "t" if ap.fragment2_sign == "+" else "h"
        v1, v2 = ap.fragment1 + fr1_suffix, ap.fragment2 + fr2_suffix
        # NOTE(review): presumably ``colors`` returns a fresh set on every access, so
        # pop() does not alter the multicolor itself - confirm.
        graph = reference_assembly_graph[ap.info.target_color.colors.pop()]
        if graph.has_edge(v1, v2):
            edge_type = graph[v1][v2]["type"]
            if edge_type == "HC":
                ap_eval.HC = True
                ap_eval.GOC = True
            elif edge_type == "GOC":
                ap_eval.GOC = True

    exp_id = "_".join(config.get("experiment_name", "Experiment XXX").split())

    result.append("<h2>Experiment info</h2>")
    result.append("<div class='well'>")
    result.append("<p>" + config.get("experiment_info", "No info").replace("\n", "<br>") + "</p>")
    result.append("</div>")

    result.append("<h3>Evaluation</h3>")
    result.append("<h4>Per target genome</h4>")
    for genome in target_genomes:
        g_suffix = exp_id + "_" + genome.name
        genome_ap = [ap_eval for ap_eval in assembly_point_evaluation
                     if ap_eval.ap.info.target_color.colors.pop().name == genome.name]
        add_collapse_header(heading=genome.name, suffix=g_suffix, result=result)
        result.append("<div class='container' style='width:100%;'>")

        add_collapse_header(heading="Assembly points", suffix=g_suffix + "_assembly_point", result=result,
                            parent_data="collapse_{suffix}".format(suffix=g_suffix))
        add_ap_table(genome_ap, result)
        add_collapse_footer(result=result)
        result.append("</div>")
        result.append("<p>SCJ distance between produced assembly and a reference one: <b>{scj_distance}</b></p>".format(scj_distance=scjs[genome]))

        total_cnt, hc_cnt, goc_cnt, incorrect_cnt = _append_assembly_points_summary(result, genome_ap)
        counts = [hc_cnt, goc_cnt, incorrect_cnt]

        result.append("<h5>Relative portions of identified assembly points</h5>")
        result.append("<div class='progress'>")
        _append_progress_bars(result, counts, total_cnt)
        result.append("</div>")

        overall_cnt = genomes_fragmentation_total[genome]

        result.append("<h5>Absolute portions of identified assembly points</h5>")
        result.append("<div class='progress'>")
        _append_progress_bars(result, counts, overall_cnt)
        # NOTE(review): unlike the overall section below, this progress container is not
        # closed explicitly here; presumably add_collapse_footer balances it - confirm.
        add_collapse_footer(result=result)

    result.append("<h4>Overall results</h4>")

    add_collapse_header(heading="Assembly_points", suffix=exp_id, result=result)
    add_ap_table(assembly_point_evaluation, result)
    add_collapse_footer(result=result)

    total_cnt, hc_cnt, goc_cnt, incorrect_cnt = _append_assembly_points_summary(result, assembly_point_evaluation)
    counts = [hc_cnt, goc_cnt, incorrect_cnt]

    result.append("<h5>Relative portions of identified assembly points</h5>")
    result.append("<div class='progress'>")
    _append_progress_bars(result, counts, total_cnt)
    result.append("</div>")

    # Only reference adjacencies of the target genomes count towards the absolute total.
    overall_cnt = sum(value for key, value in genomes_fragmentation_total.items() if key in target_genomes)
    result.append("<h5>Absolute portions of identified assembly points</h5>")
    result.append("<div class='progress'>")
    _append_progress_bars(result, counts, overall_cnt)
    result.append("</div>")
    return "\n".join(result)