def run(self, manager):
    """Load the gos-asm input data and store it under ``manager.data["gos-asm"]``.

    The following keys are written, in this order:

    * ``"bg"`` -- a breakpoint graph updated with every file listed in the
      ``block_orders_file_paths`` input option (edges are not merged);
    * ``"phylogenetic_tree"`` -- the tree parsed from the ``phylogenetic_tree``
      input option;
    * ``"target_multicolor"`` -- a multicolor built from all ``target_organisms``;
    * ``"target_multicolors"`` -- the deduplicated list of candidate target
      multicolors, sorted by the length of their hashable representation
      (longest first);
    * ``"repeats_guidance"`` -- the value returned by
      ``get_repeats_bridges_guidance`` for the ``repeats_bridges_file`` option.

    :param manager: object exposing ``logger``, ``configuration`` and ``data``
    """

    def representation_size(multicolor):
        # Sort key shared by both orderings below.
        return len(multicolor.hashable_representation)

    log = manager.logger
    log.info("Reading blocks orders data")
    input_options = manager.configuration["gos-asm"]["input"]
    blocks_graph = BreakpointGraph()
    for path in input_options["block_orders_file_paths"]:
        with open(path, "rt") as stream:
            parsed_graph = GRIMMReader.get_breakpoint_graph(stream=stream, merge_edges=False)
            blocks_graph.update(breakpoint_graph=parsed_graph, merge_edges=False)
    manager.data["gos-asm"]["bg"] = blocks_graph

    log.info("Reading phylogenetic tree information")
    tree = NewickReader.from_string(data_string=input_options["phylogenetic_tree"])
    manager.data["gos-asm"]["phylogenetic_tree"] = tree

    target_genomes = [BGGenome(organism) for organism in input_options["target_organisms"]]
    full_tmc = Multicolor(*target_genomes)
    manager.data["gos-asm"]["target_multicolor"] = full_tmc

    consistent_targets = Multicolor.split_colors(full_tmc,
                                                 guidance=tree.consistent_multicolors,
                                                 account_for_color_multiplicity_in_guidance=False)
    # Iterate over a snapshot: the list itself grows inside the loop.
    for target in list(consistent_targets):
        for candidate in deepcopy(tree.consistent_multicolors):
            if not candidate <= target:
                continue
            if candidate in consistent_targets:
                continue
            if len(candidate.colors) > 0:
                consistent_targets.append(candidate)
    consistent_targets = sorted(consistent_targets, key=representation_size, reverse=True)

    # Extend the candidates with the sum of every group (size >= 2) of
    # tree-consistent multicolors whose members pairwise share no colors.
    candidates = list(consistent_targets)
    for group_size in range(2, len(consistent_targets) + 1):
        for group in itertools.combinations(consistent_targets, group_size):
            members_overlap = any(len(left.intersect(right).colors) > 0
                                  for left, right in itertools.combinations(group, 2))
            if members_overlap:
                continue
            merged = Multicolor()
            for member in group:
                merged += member
            candidates.append(merged)

    # Deduplicate through the hashable representation, then rebuild the multicolors.
    unique_representations = {multicolor.hashable_representation for multicolor in candidates}
    all_target_multicolors = sorted((Multicolor(*representation) for representation in unique_representations),
                                    key=representation_size,
                                    reverse=True)
    manager.data["gos-asm"]["target_multicolors"] = all_target_multicolors

    log.info("Reading repeats-bridges information")
    manager.data["gos-asm"]["repeats_guidance"] = get_repeats_bridges_guidance(
        file_name=input_options["repeats_bridges_file"],
        data=manager.data)
def test_diamond():
    """Check that a 4-cycle with four single-color edges yields one diamond pattern.

    The graph has vertices 0..3 and edges 0-1, 2-3, 1-2 and 0-3, each carrying
    its own multicolor; ``find_diamond_patterns`` is expected to return exactly
    one result for it.
    """
    # NOTE(review): A, B, C and D are not defined inside this function; they are
    # assumed to be provided at module scope -- confirm, otherwise this test
    # fails with a NameError before reaching the assertion.
    multigraph = MultiGraph()
    multigraph.add_nodes_from(range(4))
    breakpoint_graph = BreakpointGraph(multigraph)
    breakpoint_graph.add_edge(0, 1, Multicolor(A))
    breakpoint_graph.add_edge(2, 3, Multicolor(B))
    breakpoint_graph.add_edge(1, 2, Multicolor(C))
    breakpoint_graph.add_edge(0, 3, Multicolor(D))
    assert len(find_diamond_patterns(breakpoint_graph)) == 1
def test_paths():
    """Check the alternating-structure size reported for a two-colored 4-cycle.

    Edges 0-1 and 2-3 carry the multicolor (A, C); edges 1-2 and 0-3 carry
    (B, D). With the topology ``((A, C), (B, D))`` the expected value of
    ``get_size_of_alternating_structures`` is 3.
    """
    ac_colors = ['A', 'C']
    bd_colors = ['B', 'D']
    edge_specs = (
        (0, 1, ac_colors),
        (2, 3, ac_colors),
        (1, 2, bd_colors),
        (0, 3, bd_colors),
    )

    base_graph = MultiGraph()
    base_graph.add_nodes_from(range(4))
    graph = BreakpointGraph(base_graph)
    for vertex1, vertex2, colors in edge_specs:
        graph.add_edge(vertex1, vertex2, Multicolor(*colors))

    structures_size = get_size_of_alternating_structures(graph, (ac_colors, bd_colors))
    assert structures_size == 3
def test_cylinder():
    """Check cylinder-pattern detection on two 4-cycles.

    Both graphs have edges 0-1 and 2-3 colored (A, B). In the first graph the
    remaining edges 1-2 and 0-3 are colored C and D respectively and one
    pattern is expected; in the second both are colored C and no pattern is
    expected.
    """

    def build_cycle(color_1_2, color_0_3):
        # Build the 4-cycle with the two (A, B) edges plus the two given single colors.
        base_graph = MultiGraph()
        base_graph.add_nodes_from(range(4))
        graph = BreakpointGraph(base_graph)
        shared_colors = ['A', 'B']
        graph.add_edge(0, 1, Multicolor(*shared_colors))
        graph.add_edge(2, 3, Multicolor(*shared_colors))
        graph.add_edge(1, 2, Multicolor(color_1_2))
        graph.add_edge(0, 3, Multicolor(color_0_3))
        return graph

    distinct_sides = build_cycle('C', 'D')
    assert len(find_cylinder_patterns(distinct_sides)) == 1

    equal_sides = build_cycle('C', 'C')
    assert len(find_cylinder_patterns(equal_sides)) == 0
def run(self, manager):
    """Run MGRA on the current breakpoint graph and load the resulting guidance graph.

    Does nothing (besides logging) when no ``executable_path`` is configured
    under ``mgra``. Otherwise the blocks orders and an MGRA configuration are
    written into ``<gos-asm output dir>/tmp_mgra``, MGRA is invoked through the
    shell, and every ``*.gen`` file found in ``tmp_mgra/output/genomes`` is read
    into a single breakpoint graph that is stored as
    ``manager.data["mgra"]["guidance_graph"]``.

    :param manager: object exposing ``logger``, ``configuration`` and ``data``
    """
    mgra_ex_path = get_from_dict_with_path(manager.configuration, key="executable_path", path=["mgra"])
    manager.logger.info("=" * 80)
    if mgra_ex_path is None:
        manager.logger.info("MGRA executable path is not supplied, skipping the MGRA based tasks")
        return

    manager.logger.info("Preparing data to communicate with MGRA and ontain guidance graph")
    temp_dir = os.path.join(manager.configuration["gos-asm"]["output"]["dir"], "tmp_mgra")
    # makedirs(exist_ok=True) avoids the check-then-create race and also creates
    # the output directory itself when it does not exist yet.
    os.makedirs(temp_dir, exist_ok=True)
    blocks_file_name = os.path.join(temp_dir, "blocks.txt")
    config_file_name = os.path.join(temp_dir, "config.cfg")
    mgra_output_dir_name = os.path.join(temp_dir, "output/")

    manager.logger.debug("Writing blocks orders in GRIMM format to {file_name}".format(file_name=blocks_file_name))
    GRIMMWriter.print_genomes_as_grimm_blocks_orders(bg=manager.data["gos-asm"]["bg"],
                                                     file_name=blocks_file_name)

    manager.logger.debug("Writing configuration file for MGRA run to {file_name}".format(file_name=config_file_name))
    config = self.create_mgra_config(blocks_file_name=blocks_file_name, manager=manager)
    with open(config_file_name, "wt") as destination:
        json.dump(obj=config, fp=destination)

    manager.logger.info("Running MGRA on prepared configuration")
    exit_status = os.system("{mgra_ex_path} -c {config_file_path} -o {output_dir_path}"
                            "".format(mgra_ex_path=mgra_ex_path,
                                      config_file_path=config_file_name,
                                      output_dir_path=mgra_output_dir_name))
    # Only claim success when the command actually reported it. Execution still
    # continues on failure (as before), but the log now points at the real cause
    # if reading the output below fails.
    if exit_status == 0:
        manager.logger.debug("MGRA has successfully finished")
    else:
        manager.logger.error("MGRA exited with a non-zero status: {status}".format(status=exit_status))

    manager.logger.info("Reading MGRA produced guidance graph")
    genomes_dir = os.path.join(mgra_output_dir_name, "genomes")
    full_genomes_paths = [os.path.join(genomes_dir, name)
                          for name in os.listdir(genomes_dir) if name.endswith(".gen")]
    guidance_bg = BreakpointGraph()
    for file_name in full_genomes_paths:
        with open(file_name, "rt") as source:
            guidance_bg.update(breakpoint_graph=GRIMMReader.get_breakpoint_graph(stream=source, merge_edges=False),
                               merge_edges=False)

    if "mgra" not in manager.data:
        manager.data["mgra"] = {}
    manager.data["mgra"]["guidance_graph"] = guidance_bg
    manager.logger.info("Obtained MGRA produced guidance graph")
# Organisms whose scaffolds are to be assembled.
TARGET_ORGANISM_NAMES = ["human", "chimp", "rat"]
# Organisms passed to the assembler in the ``exclude`` argument.
COMPLETE_ORGANISM_NAMES = ["dog", "opossum", "cat", "mouse"]
################################################################################################################
#
# END OF experiment set up (data)
#
################################################################################################################

if __name__ == "__main__":
    # Two positional arguments select the input directory "<PATH><length>_<identity>/".
    if len(sys.argv) < 3:
        sys.exit("usage: {script} <length> <identity>".format(script=sys.argv[0]))
    length = sys.argv[1]
    identity = sys.argv[2]
    for organism_name in TARGET_ORGANISM_NAMES:
        GRIMM_FILES.append(PATH + length + "_" + identity + "/" + organism_name + ".txt")

    print("Reading data into breakpoint graph...")
    graph = BreakpointGraph()
    for grimm_file in GRIMM_FILES:
        with open(grimm_file, "rt") as source:
            graph.update(GRIMMReader.get_breakpoint_graph(source), merge_edges=True)

    print("Getting a tree...")
    bgtree = NewickReader.from_string(NEWICK_STRING_TREE)

    print("Preparing organisms for assembling...")
    target_organisms = [BGGenome(organism) for organism in TARGET_ORGANISM_NAMES]
    exclude = [BGGenome(organism) for organism in COMPLETE_ORGANISM_NAMES]

    print("Staring the assembly process...")
    result = assemble_scaffolds(graph=graph, bgtree=bgtree, target_organisms=target_organisms, exclude=exclude,
                                verbose=True)
    print("Finished assembling!")
# print("\t", " ".join( # strand + str(gene_name) for strand, gene_name in grimm_formatted_genomes[genome][scaffold_name]), # "$") print(" ".join(strand + str(gene_name) for strand, gene_name in grimm_formatted_genomes[genome][scaffold_name]), "$", file=target) # print() bg_graphs = dict() for file_name in os.listdir(target_directory): file_name = os.path.join(target_directory, file_name) with open(file_name, "r") as source: bg = GRIMMReader.get_breakpoint_graph(source) bg_graphs[file_name.split(".")[0]] = bg bg = BreakpointGraph() for br_gr in bg_graphs.values(): bg.update(br_gr, merge_edges=True) target_multicolor = Multicolor("Anguilla_japonica") print("Breakpoint graph stats:") print( "\t", "non-infinity nodes count:", len( list(node for node in bg.nodes() if not BGVertex.is_infinity_vertex(node)))) normal_edges, infinity_edges = [], [] for edge in bg.edges(): if edge.is_infinity_edge: infinity_edges.append(edge)
def _percentage(value, total):
    """Return ``value`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    if total == 0:
        return 0.0
    return value * 100 / total


def _read_fragments_orders(file_path):
    """Group the non-empty, non-comment lines of ``file_path`` by genome.

    A line starting with ``>`` switches the current genome (named by the rest
    of the line) and is itself stored; every following line is stored under
    that genome. Lines seen before the first ``>`` header are ignored.
    """
    orders = defaultdict(list)
    current_genome = None
    with open(file_path, "rt") as source:
        for raw_line in source:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith(">"):
                current_genome = BGGenome(line[1:])
                orders[current_genome].append(line)
            elif current_genome is not None:
                orders[current_genome].append(line)
    return orders


def _append_ap_counts(result, ap_evaluations):
    """Append the ``<ul>`` summary of assembly point counts and return them.

    Returns ``(total, correct, goc_only, incorrect)``.
    """
    result.append("<ul>")
    total_cnt = len(ap_evaluations)
    result.append("<li><p>Total # of identified assembly points: <b>{ap_cnt}</b></p></li>".format(ap_cnt=total_cnt))
    hc_cnt = len([ap for ap in ap_evaluations if ap.HC])
    result.append("<li><p>Correct assembly points: <b>{HC_cnt}</b></p></li>".format(HC_cnt=hc_cnt))
    goc_cnt = len([ap for ap in ap_evaluations if ap.GOC and not ap.HC])
    result.append(
        "<li><p>Correct from Global Gene Order (GOC) perspective assembly points: <b>{GOC_cnt}</b></p></li>".format(GOC_cnt=goc_cnt))
    incorrect_cnt = len([ap for ap in ap_evaluations if not ap.HC and not ap.GOC])
    result.append("<li><p>Incorrect assembly points: <b>{ic_cnt}</b></p></li>".format(ic_cnt=incorrect_cnt))
    result.append("</ul>")
    return total_cnt, hc_cnt, goc_cnt, incorrect_cnt


def _append_progress_bars(result, heading, counts, total, close_progress=True):
    """Append a heading and a progress ``<div>`` with one bar per entry of ``counts``.

    ``counts`` is ``(correct, goc_only, incorrect)``; every bar's width is the
    count's percentage of ``total``. The enclosing progress ``<div>`` is closed
    only when ``close_progress`` is true.
    """
    result.append("<h5>{heading}</h5>".format(heading=heading))
    result.append("<div class='progress'>")
    for value, style in zip(counts, ["success", "info", "danger"]):
        value_prs = _percentage(value, total)
        result.append(
            '<div class="progress-bar progress-bar-{style}" role="progressbar" aria-valuenow="{value}" aria-valuemin="0" aria-valuemax="100" style="width: {value}%">'.format(style=style, value=value_prs))
        result.append("<span>{value:.2f}%</span>".format(value=value_prs))
        result.append("</div>")
    if close_progress:
        result.append("</div>")


def get_html_report_experiment_entry(experiment):
    """Build the HTML report fragment for a single experiment.

    The experiment configuration is imported as a Python module from
    ``experiment.config_file_path`` and merged with the default configuration.
    Assembly points (a ``|``-delimited file), the assembled chains and the
    reference fragment orders (``chr_fragments_order.txt`` in
    ``experiment.evaluation_dir_path``) are then read. Every assembly point is
    marked as ``HC`` and/or ``GOC`` when the reference graph of its genome has
    a matching edge, and per-genome plus overall statistics are rendered.

    :param experiment: object exposing ``config_file_path`` and ``evaluation_dir_path``
    :return: the HTML fragment as a single newline-joined string
    """
    module_path, file_name = os.path.split(experiment.config_file_path)
    if module_path not in sys.path:
        sys.path.insert(0, module_path)
    # splitext keeps a name without an extension intact (slicing at rfind(".")
    # would cut off its last character).
    module_name = os.path.splitext(file_name)[0]
    # Drop a previously imported module with the same name so that this
    # experiment's configuration file is the one that gets imported.
    if module_name in sys.modules:
        del sys.modules[module_name]
    module = importlib.import_module(module_name)
    config = module.configuration
    config = recursive_dict_update(default_configuration.configuration, config)

    result = []
    result.append("<hr>")
    result.append("<hr>")
    result.append("<h1>{experiment_name}</h1>".format(experiment_name=config.get("experiment_name", "Experiment XXX")))

    reference_chr_fragments_order_full_path = os.path.join(experiment.evaluation_dir_path, "chr_fragments_order.txt")
    output_config = config["gos-asm"]["output"]
    assembled_chains_full_path = os.path.join(output_config["dir"], output_config["chains_file"])
    assembly_points_file_full_path = os.path.join(output_config["dir"], output_config["assembly_points_file"])

    assembly_point_evaluation = []
    with open(assembly_points_file_full_path, newline="") as csv_file:
        reader = csv.reader(csv_file, delimiter="|")
        headers = [header.strip() for header in next(reader)]
        for row in reader:
            entry = dict(zip(headers, row))
            ap = AssemblyPoint.from_assembly_points_file(separated_values=entry)
            assembly_point_evaluation.append(AssemblyPointEvaluation(ap=ap))

    genomes_ref_chr_fragments_orders = _read_fragments_orders(reference_chr_fragments_order_full_path)
    assembled_genome_fragments_orders = _read_fragments_orders(assembled_chains_full_path)

    # NOTE(review): this combined graph is never used afterwards (``bg`` is
    # rebound per genome below). It is kept because parsing both files here can
    # raise on malformed input; confirm whether it can be dropped.
    bg = BreakpointGraph()
    for file_path in [reference_chr_fragments_order_full_path, assembled_chains_full_path]:
        with open(file_path, "rt") as source:
            bg.update(GRIMMReader.get_breakpoint_graph(source, merge_edges=False))

    scjs = {}
    genomes_fragmentation_total = {}
    target_genomes = [BGGenome(genome_name) for genome_name in config["gos-asm"]["input"]["target_organisms"]]
    for genome in target_genomes:
        assembled_chains = assembled_genome_fragments_orders[genome]
        reference_chains = genomes_ref_chr_fragments_orders[genome]
        source = assembled_chains + reference_chains
        bg = GRIMMReader.get_breakpoint_graph(source, merge_edges=False)
        scjs[genome] = single_cut_and_join_distance(bg)

    # Build, per genome, a graph whose edges are typed "HC" (consecutive
    # fragments in a reference line, plus last-to-first when the line ends
    # with "@") or "GOC" (pairs returned by get_closure_fragments that have no
    # edge yet).
    reference_assembly_graph = {}
    current_genome = None
    with open(reference_chr_fragments_order_full_path, "rt") as source:
        for raw_line in source:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith(">"):
                current_genome = BGGenome(line[1:])
                if current_genome not in reference_assembly_graph:
                    reference_assembly_graph[current_genome] = Graph()
                    genomes_fragmentation_total[current_genome] = 0
            elif current_genome is not None:
                # The last token of the line is the chromosome type marker.
                *fragments, chr_type = line.split()
                genome_graph = reference_assembly_graph[current_genome]
                genomes_fragmentation_total[current_genome] += len(fragments) - 1
                for fragment1, fragment2 in zip(fragments[:-1], fragments[1:]):
                    v1, v2 = get_pair_of_fragments_vertices(fragment1, fragment2)
                    genome_graph.add_edge(v1, v2, attr_dict={"type": "HC"})
                if chr_type == "@":
                    v1, v2 = get_pair_of_fragments_vertices(fragments[-1], fragments[0])
                    genome_graph.add_edge(v1, v2, attr_dict={"type": "HC"})
                for cnt, fragment in enumerate(fragments):
                    closure_fragments = get_closure_fragments(fragments, cnt, chr_type)
                    for fr in closure_fragments:
                        v1, v2 = get_pair_of_fragments_vertices(fragment, fr)
                        if not genome_graph.has_edge(v1, v2):
                            genome_graph.add_edge(v1, v2, attr_dict={"type": "GOC"})

    for ap_eval in assembly_point_evaluation:
        fr1_suffix = "h" if ap_eval.ap.fragment1_sign == "+" else "t"
        fr2_suffix = "t" if ap_eval.ap.fragment2_sign == "+" else "h"
        v1, v2 = ap_eval.ap.fragment1 + fr1_suffix, ap_eval.ap.fragment2 + fr2_suffix
        graph = reference_assembly_graph[ap_eval.ap.info.target_color.colors.pop()]
        if graph.has_edge(v1, v2):
            type_ = graph[v1][v2]["type"]
            if type_ == "HC":
                ap_eval.HC = True
                ap_eval.GOC = True
            elif type_ == "GOC":
                ap_eval.GOC = True

    exp_id = config.get("experiment_name", "Experiment XXX")
    exp_id = "_".join(exp_id.split())
    result.append("<h2>Experiment info</h2>")
    result.append("<div class='well'>")
    result.append("<p>" + config.get("experiment_info", "No info").replace("\n", "<br>") + "</p>")
    result.append("</div>")
    result.append("<h3>Evaluation</h3>")

    result.append("<h4>Per target genome</h4>")
    for genome in target_genomes:
        g_suffix = exp_id + "_" + genome.name
        genome_ap = [ap_eval for ap_eval in assembly_point_evaluation
                     if ap_eval.ap.info.target_color.colors.pop().name == genome.name]
        add_collapse_header(heading=genome.name, suffix=g_suffix, result=result)
        result.append("<div class='container' style='width:100%;'>")
        add_collapse_header(heading="Assembly points", suffix=g_suffix + "_assembly_point", result=result,
                            parent_data="collapse_{suffix}".format(suffix=g_suffix))
        add_ap_table(genome_ap, result)
        add_collapse_footer(result=result)
        result.append("</div>")
        result.append("<p>SCJ distance between produced assembly and a reference one: <b>{scj_distance}</b></p>".format(scj_distance=scjs[genome]))
        total_cnt, hc_cnt, goc_cnt, incorrect_cnt = _append_ap_counts(result, genome_ap)
        counts = [hc_cnt, goc_cnt, incorrect_cnt]
        _append_progress_bars(result, "Relative portions of identified assembly points", counts, total_cnt)
        overall_cnt = genomes_fragmentation_total[genome]
        # The original per-genome "absolute" section emits no closing tag for
        # the progress <div> before the collapse footer; that output is preserved.
        _append_progress_bars(result, "Absolute portions of identified assembly points", counts, overall_cnt,
                              close_progress=False)
        add_collapse_footer(result=result)

    result.append("<h4>Overall results</h4>")
    add_collapse_header(heading="Assembly_points", suffix=exp_id, result=result)
    add_ap_table(assembly_point_evaluation, result)
    add_collapse_footer(result=result)
    total_cnt, hc_cnt, goc_cnt, incorrect_cnt = _append_ap_counts(result, assembly_point_evaluation)
    counts = [hc_cnt, goc_cnt, incorrect_cnt]
    _append_progress_bars(result, "Relative portions of identified assembly points", counts, total_cnt)
    overall_cnt = sum([value for key, value in genomes_fragmentation_total.items() if key in target_genomes])
    _append_progress_bars(result, "Absolute portions of identified assembly points", counts, overall_cnt)
    return "\n".join(result)