def test_overlapping_deletion(self):
    """
    A deletion and an SNV that share a position on the same alt node.

    The node chr:10-17 (ATCGATCG) receives a deletion over node offsets 2-4
    and an SNV (-> C) at node offset 4.  After empty nodes are removed the
    node is expected to be split into AT | CG | A or C | TCG, with the
    deletion represented by a direct edge from AT to TCG.
    """
    graph = graphContainer.GraphContainer()
    source = graph.add_altNode("chr", 10, 17, "ATCGATCG")
    deletion = {"start": 2, "end": 4, "alt": ""}
    snv = {"start": 4, "end": 4, "alt": "C"}
    variants.add_variants(graph, {source['name']: [deletion, snv]})
    graphUtils.remove_empty_nodes(graph)

    expected_names = [
        "chr:10-11:AT",
        "chr:12-13:CG",
        "chr:14-14:A",
        "chr:14-14:C",
        "chr:15-17:TCG",
    ]
    actual_names = [node['name'] for node in graph.nodes.values()]
    self.assertCountEqual(actual_names, expected_names)

    prefix = graph.nodes["chr:10-11:AT"]
    suffix = graph.nodes["chr:15-17:TCG"]
    snv_alt = graph.nodes["chr:14-14:C"]
    snv_ref = graph.nodes["chr:14-14:A"]
    # Reference bases 12-13: covered by the deletion but not by the SNV.
    middle = graph.nodes["chr:12-13:CG"]

    expected_edges = [
        (prefix, suffix),  # deletion path skipping 12-14
        (prefix, middle),
        (middle, snv_ref),
        (snv_ref, suffix),
        (middle, snv_alt),
        (snv_alt, suffix),
    ]
    for source_node, target_node in expected_edges:
        self.assertTrue(graph.has_edge(source_node, target_node))

    forbidden_edges = [
        (prefix, snv_ref),
        (prefix, snv_alt),
        (snv_ref, middle),
    ]
    for source_node, target_node in forbidden_edges:
        self.assertFalse(graph.has_edge(source_node, target_node))
def test_ins_end(self):
    """
    An insertion placed directly after the last base of an alt node.

    The alt node chr:10-17 is connected to the reference node chr:18-20 by
    an edge labelled "foo".  The variant has start 8 / end 7 (an empty
    interval just past the node's last base) and inserts CCC.  The original
    node must stay intact, the insertion must appear as a bypass node
    between it and the reference node, and the "foo" label must be present
    on both edges entering the reference node.
    """
    graph = graphContainer.GraphContainer()
    ref_node = graph.add_refNode("chr", 18, 20)
    alt_node = graph.add_altNode("chr", 10, 17, "ATCGATCG")
    graph.add_edge(alt_node, ref_node, ["foo"])

    insertion = {"start": 8, "end": 7, "alt": "CCC"}
    variants.add_variants(graph, {alt_node['name']: [insertion]})
    graphUtils.remove_empty_nodes(graph)

    upstream = graph.nodes["chr:10-17:ATCGATCG"]
    downstream = graph.nodes["ref-chr:18-20"]
    inserted = graph.nodes["chr:18-17:CCC"]
    self.assertEqual(len(graph.nodes), 3)

    for source_node, target_node in (
            (upstream, downstream),
            (upstream, inserted),
            (inserted, downstream)):
        self.assertTrue(graph.has_edge(source_node, target_node))

    for source_node, target_node in (
            (inserted, upstream),
            (downstream, inserted)):
        self.assertFalse(graph.has_edge(source_node, target_node))

    # Both edges into the reference node are expected to carry the label
    # that was on the original edge.
    for source_node in (upstream, inserted):
        edge = graph.get_edge(source_node['name'], downstream['name'])
        self.assertCountEqual(edge['sequences'], ["foo"])
def run(args):
    """
    Add variants to a graph loaded from JSON and write the result as JSON.

    Variants come from the separate variant JSON file when ``args.variants``
    is set, otherwise from the "variants" entry of the graph JSON itself.

    :param args: parsed command line arguments; this function reads
                 ``verbose``, ``graph``, ``variants`` and ``output``
    :raises Exception: if a variant JSON file is given but has no "variants" key
    """
    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    # An argparse "count" option is None when the flag is absent and no default
    # was configured; treat that as verbosity 0 instead of failing in min().
    verbosity = min(args.verbose or 0, len(levels) - 1)
    logging.basicConfig(format='%(message)s', level=levels[verbosity])

    graphDict = load_json(args.graph)
    graph = graphUtils.load_json(graphDict)

    if args.variants:
        varJson = load_json(args.variants)
        if "variants" not in varJson:
            raise Exception("No variants in variant JSON")
        varDict = varJson["variants"]
    else:
        varDict = graphDict.get("variants", {})
        if not varDict:
            logging.warning("No variants in graph")

    # Logged rather than printed: args.output is the sink for the JSON result
    # and may be stdout, where a stray print would corrupt the JSON stream.
    logging.debug("Variant keys: %s", list(varDict))

    variants.add_variants(graph, varDict)
    graphUtils.remove_empty_nodes(graph)

    json.dump(graph.json_dict(), args.output, sort_keys=True)
def test_var_begin(self):
    """
    An SNV on the very first base of an alt node.

    The reference node chr:1-9 is connected to the alt node chr:10-17 by an
    edge labelled "foo".  The variant replaces node offset 0 (A) with G.
    The first base must be split off into parallel ref/alt nodes, and both
    edges leaving the reference node must carry the "foo" label.
    """
    graph = graphContainer.GraphContainer()
    ref_node = graph.add_refNode("chr", 1, 9)
    alt_node = graph.add_altNode("chr", 10, 17, "ATCGATCG")
    graph.add_edge(ref_node, alt_node, ["foo"])

    snv = {"start": 0, "end": 0, "alt": "G"}
    variants.add_variants(graph, {alt_node['name']: [snv]})
    graphUtils.remove_empty_nodes(graph)

    upstream = graph.nodes["ref-chr:1-9"]
    remainder = graph.nodes["chr:11-17:TCGATCG"]
    first_ref = graph.nodes["chr:10-10:A"]
    first_alt = graph.nodes["chr:10-10:G"]
    self.assertEqual(len(graph.nodes), 4)

    for source_node, target_node in (
            (upstream, first_ref),
            (upstream, first_alt),
            (first_ref, remainder),
            (first_alt, remainder)):
        self.assertTrue(graph.has_edge(source_node, target_node))

    for source_node, target_node in (
            (upstream, remainder),
            (first_ref, first_alt)):
        self.assertFalse(graph.has_edge(source_node, target_node))

    # Both edges out of the reference node are expected to carry the label
    # that was on the original edge.
    for target_node in (first_ref, first_alt):
        edge = graph.get_edge(upstream['name'], target_node['name'])
        self.assertCountEqual(edge['sequences'], ["foo"])
def convert_vcf(vcf,
                ref,
                target_regions=None,
                ref_node_padding=150,
                ref_node_max_length=1000,
                allele_graph=False,
                simplify=True,
                alt_paths=False,
                alt_splitting=False):
    """
    Convert a single VCF file to a graph dictionary

    The VCF is first rewritten to a temporary compressed, indexed copy.  One
    graph is then built per target region and merged into the result; regions
    without VCF records are skipped.  When no target regions are given a
    single (None, None, None) region is passed to VCFGraph.create_from_vcf.
    The temporary VCF and its index are removed before returning, also when
    an error occurs.

    :param vcf: file name of the VCF file
    :param ref: reference FASTA file name
    :param target_regions: target region list
    :param ref_node_padding: padding / read length
    :param ref_node_max_length: maximum length before splitting a reference node
    :param allele_graph: add edges between any compatible allele pair, not just haplotypes from input
    :param simplify: simplify the graph
    :param alt_paths: Add all possible non-reference paths to the graph
    :param alt_splitting: also split long alt nodes (e.g. long insertions)
    :return: dictionary containing JSON graph
    """
    graph = GraphContainer("Graph from %s" % vcf)
    indexed_vcf = tempfile.NamedTemporaryFile(delete=False, suffix=".vcf.gz")
    try:
        # Only the name is needed; bcftools writes the file itself.
        indexed_vcf.close()
        # noinspection PyUnresolvedReferences
        pysam.bcftools.view(vcf, "-o", indexed_vcf.name, "-O", "z", catch_stdout=False)  # pylint: disable=no-member
        # noinspection PyUnresolvedReferences
        pysam.bcftools.index(indexed_vcf.name)  # pylint: disable=no-member

        regions = map(parse_region, target_regions) if target_regions else [(None,) * 3]
        for (chrom, start, end) in regions:
            if chrom is not None:
                logging.info("Starting work on region: %s:%s-%s", chrom, start, end)
            try:
                vcfGraph = VCFGraph.create_from_vcf(
                    ref, indexed_vcf.name, chrom, start, end, ref_node_padding, allele_graph)
            except NoVCFRecordsException:
                logging.info("Region %s:%s-%s has no VCF records, skipping.", chrom, start, end)
                continue
            # Lazy logging arguments: the graph is only rendered to text when
            # INFO logging is actually enabled.
            logging.info("CONSTRUCTED VCF GRAPH:\n%s", vcfGraph)
            chromGraph = vcfGraph.get_graph(allele_graph)
            if ref_node_max_length:
                graphUtils.split_ref_nodes(chromGraph, ref_node_max_length, ref_node_padding)
                if alt_splitting:
                    graphUtils.split_alt_nodes(chromGraph, ref_node_max_length, ref_node_padding)

            if simplify:
                graphUtils.remove_empty_nodes(chromGraph)
                graphUtils.combine_nodes(chromGraph)
                # Disable edge label simplification for now. May use node-label short-cut later
                # graphUtils.remove_redundant_edge_labels(graph)
            chromGraph.check()

            graphUtils.add_graph(graph, chromGraph)
    finally:
        # bcftools index writes its index next to the VCF (.csi by default,
        # .tbi when tabix format is requested); remove it so that no
        # temporary files are left behind.
        for index_suffix in (".csi", ".tbi"):
            index_path = indexed_vcf.name + index_suffix
            if os.path.exists(index_path):
                os.remove(index_path)
        os.remove(indexed_vcf.name)

    graph.target_regions = target_regions or graph.get_reference_regions()
    graphUtils.add_source_sink(graph)
    graphUtils.add_ref_path(graph)
    if alt_paths:
        graphUtils.add_alt_paths(graph)
    graph.check()
    return graph.json_dict()