def load_json(json) -> GraphContainer:
    """
    Build a graph object from its JSON (dictionary) representation.

    Each entry of ``json["nodes"]`` is handled according to the keys it has:
    an entry with a ``"reference"`` key is added as a reference node, an
    entry with a ``"position"`` key is added as an alt node, and any other
    entry is stored unchanged in ``graph.nodes`` under its name.

    :param json: Dictionary of JSON file contents; must provide "nodes",
                 "edges" and "model_name", may provide "paths" and
                 "target_regions"
    :return: the populated graph, after its ``check()`` has been run
    """
    container = GraphContainer()

    for entry in json["nodes"]:
        labels = entry.get("sequences", ())

        if "reference" in entry:
            region = parse_region(entry["reference"])
            chrom, start, end = region
            container.add_refNode(chrom, start, end, labels, entry["name"])
            continue

        if "position" in entry:
            region = parse_region(entry["position"])
            chrom, start, end = region
            container.add_altNode(chrom, start, end, entry["sequence"], labels, entry["name"])
            continue

        # Neither a reference nor a positioned node: keep the raw dictionary.
        container.nodes[entry["name"]] = entry

    for link in json["edges"]:
        labels = link.get("sequences", ())
        container.add_edge(
            container.nodes[link["from"]],
            container.nodes[link["to"]],
            labels,
        )

    container.name = json["model_name"]
    container.paths = json.get("paths", [])
    container.target_regions = json.get("target_regions", [])

    container.check()
    return container
def convert_vcf(vcf,
                ref,
                target_regions=None,
                ref_node_padding=150,
                ref_node_max_length=1000,
                allele_graph=False,
                simplify=True,
                alt_paths=False,
                alt_splitting=False):
    """
    Convert a single VCF file to a graph dictionary

    The input VCF is first re-written as a compressed, indexed temporary
    copy; that copy and its index are removed again before returning, also
    when an error occurs.

    :param vcf: file name of the VCF file
    :param ref: reference FASTA file name
    :param target_regions: target region list; when empty / None the whole
                           VCF is processed and the target regions are taken
                           from the graph's reference regions
    :param ref_node_padding: padding / read length
    :param ref_node_max_length: maximum length before splitting a reference node
    :param allele_graph: add edges between any compatible allele pair, not just haplotypes from input
    :param simplify: simplify the graph
    :param alt_paths: Add all possible non-reference paths to the graph
    :param alt_splitting: also split long alt nodes (e.g. long insertions)
    :return: dictionary containing JSON graph
    """
    graph = GraphContainer("Graph from %s" % vcf)

    # delete=False: only the file *name* is needed, bcftools writes the content.
    indexed_vcf = tempfile.NamedTemporaryFile(delete=False, suffix=".vcf.gz")
    try:
        indexed_vcf.close()
        # noinspection PyUnresolvedReferences
        pysam.bcftools.view(vcf, "-o", indexed_vcf.name, "-O", "z", catch_stdout=False)  # pylint: disable=no-member
        # noinspection PyUnresolvedReferences
        pysam.bcftools.index(indexed_vcf.name)  # pylint: disable=no-member

        # Without target regions, run once with an unrestricted (None) region.
        regions = map(parse_region, target_regions) if target_regions else [(None,) * 3]
        for (chrom, start, end) in regions:
            if chrom is not None:
                logging.info("Starting work on region: %s:%s-%s", chrom, start, end)
            try:
                vcf_graph = VCFGraph.create_from_vcf(
                    ref, indexed_vcf.name, chrom, start, end, ref_node_padding, allele_graph)
            except NoVCFRecordsException:
                logging.info("Region %s:%s-%s has no VCF records, skipping.", chrom, start, end)
                continue
            # Lazy formatting: the graph is only stringified if INFO is enabled.
            logging.info("CONSTRUCTED VCF GRAPH:\n%s", vcf_graph)

            chrom_graph = vcf_graph.get_graph(allele_graph)
            if ref_node_max_length:
                graphUtils.split_ref_nodes(chrom_graph, ref_node_max_length, ref_node_padding)
                if alt_splitting:
                    graphUtils.split_alt_nodes(chrom_graph, ref_node_max_length, ref_node_padding)

            if simplify:
                graphUtils.remove_empty_nodes(chrom_graph)
                graphUtils.combine_nodes(chrom_graph)
                # Edge label simplification (graphUtils.remove_redundant_edge_labels)
                # is disabled for now. May use node-label short-cut later.
            chrom_graph.check()

            graphUtils.add_graph(graph, chrom_graph)
    finally:
        # bcftools index writes the index next to the VCF (".csi" by default,
        # ".tbi" when a tabix index is requested); clean those up as well so
        # no temporary files are left behind.
        for index_suffix in (".csi", ".tbi"):
            index_path = indexed_vcf.name + index_suffix
            if os.path.exists(index_path):
                os.remove(index_path)
        os.remove(indexed_vcf.name)

    graph.target_regions = target_regions or graph.get_reference_regions()
    graphUtils.add_source_sink(graph)
    graphUtils.add_ref_path(graph)
    if alt_paths:
        graphUtils.add_alt_paths(graph)
    graph.check()
    return graph.json_dict()