def write_agp(self, filename):
    """
    Write every scaffold returned by self.iter_scaffold() to `filename` as
    AGP lines, reindex the file in place, and return `filename`.

    Within a scaffold, each entry except the last produces a component line
    followed by a gap line; the last entry produces a component line only.
    """
    sizes = self.sz  # handed to AGPLine.cline unchanged
    agp = []
    for scaffold, lines in self.iter_scaffold():
        # pairwise() yields (current, next); only `current` is needed, which
        # visits every entry except the final one.
        for current, _ in pairwise(lines):
            cline = AGPLine.cline(scaffold, current.tig, sizes, current.oo)
            gline = AGPLine.gline(scaffold, current.gaps)
            agp.append(cline)
            agp.append(gline)

        # The final entry has no trailing gap.
        last = lines[-1]
        agp.append(AGPLine.cline(scaffold, last.tig, sizes, last.oo))

    # All lines are built before the file is opened, so a failure above does
    # not truncate an existing file; `with` closes the handle even when a
    # write fails.
    with open(filename, "w") as fw:
        for line in agp:
            fw.write(str(line) + "\n")

    reindex([filename, "--inplace"])
    return filename
def write_AGP(self, filename, orientationguide=None, reindex=True):
    """
    Write one AGP component line per component in self.lines, plus gap lines
    where needed, to `filename`.

    For each component, we have two overlaps: North and South.

    =======
       ||||             South
       ====(=================)  Current BAC
    North             ||||
                      ===============

    For the case that says "Non-terminal", the overlap will not be
    considered. North-South would suggest a '+' orientation, South-North
    would suggest a '-' orientation. In most cases, unless the overlap
    involves phase1 BAC, the selected range will be shown as the brackets
    above - exclude North overlap, and include South overlap (aka the
    "left-greedy" rule).

    `orientationguide` maps a component id to '+' or '-' and is consulted
    only when neither overlap defines an orientation (fallback is '+').
    When `reindex` is true, jcvi.formats.agp.reindex is run on the file and
    the resulting `.reindexed.agp` file is moved over `filename`.
    """
    if orientationguide is None:
        orientationguide = {}

    fw = must_open(filename, "w")
    try:
        for aid, bb in groupby(self.lines, key=lambda x: x.aid):
            # Exactly two records per component are expected; any other
            # count fails this unpacking with ValueError.
            north, south = list(bb)
            aid = north.aid
            assert aid == south.aid

            aphase = north.aphase
            size = north.asize
            # Object start/end and part number are written as zeros here;
            # presumably recomputed by the reindex step - confirm.
            ar = [north.chr, 0, 0, 0]

            northline = southline = None
            northrange = southrange = None

            # Warn if adjacent components do not have valid overlaps
            if south.is_no_overlap:
                sys.stderr.write(str(south) + "\n")

            # Most gaps, except telomeres occur twice, so only do the "North"
            if north.is_gap:
                bar = ar + self.get_agp_gap(north.bid)
                northline = "\t".join(str(x) for x in bar)
            elif north.isTerminal():
                northrange = north.astart, north.astop

            if south.is_gap:
                if south.bid == "telomere":
                    bar = ar + self.get_agp_gap(south.bid)
                    southline = "\t".join(str(x) for x in bar)
            elif south.isTerminal():
                southrange = south.astart, south.astop
            else:
                bar = ar + self.get_agp_gap("fragment")
                southline = "\t".join(str(x) for x in bar)

            # Determine the orientation and clear range for the current BAC
            clr = [1, size]
            orientation = sorientation = None
            if northrange:
                start, stop = northrange
                lhang = start - 1    # bases left of the overlap
                rhang = size - stop  # bases right of the overlap

                orientation = '+' if lhang < rhang else '-'
                # Override left-greedy (also see below): keep the North
                # overlap when the North neighbor is phase 1 and the
                # current BAC is of a higher phase.
                if north.bphase == 1 and north.bphase < aphase:
                    if lhang < rhang:  # North overlap at 5`
                        clr[0] = start
                    else:
                        clr[1] = stop
                else:
                    if lhang < rhang:
                        clr[0] = stop + 1
                    else:
                        clr[1] = start - 1

            if southrange:
                start, stop = southrange
                lhang = start - 1
                rhang = size - stop

                sorientation = '+' if lhang > rhang else '-'
                # Override left-greedy (also see above): drop the South
                # overlap when the current BAC is phase 1 and the South
                # neighbor is of a higher phase.
                if aphase == 1 and aphase < south.bphase:
                    if lhang < rhang:  # South overlap at 5`
                        clr[0] = stop + 1
                    else:
                        clr[1] = start - 1
                else:
                    if lhang < rhang:
                        clr[0] = start
                    else:
                        clr[1] = stop

            if orientation:
                # A disagreement is only logged; the North orientation wins.
                # An explicit comparison is used instead of assert so the
                # message is still logged under `python -O`.
                if sorientation and orientation != sorientation:
                    logging.debug("Orientation conflicts:\n{0}\n{1}".format(north, south))
            elif sorientation:
                orientation = sorientation
            else:  # Both overlaps fail to define orientation
                orientation = orientationguide.get(aid, "+")

            component_type = "D" if aphase in (1, 2) else "F"
            bar = ar + [component_type, aid, clr[0], clr[1], orientation]
            cline = "\t".join(str(x) for x in bar)

            if northline:
                fw.write(northline + "\n")
            fw.write(cline + "\n")
            if southline:
                fw.write(southline + "\n")
    finally:
        # Close the handle even if a record fails part-way through.
        fw.close()

    if reindex:
        # Imported under an alias so the function does not rebind the
        # `reindex` boolean parameter.
        from jcvi.formats.agp import reindex as reindex_agp
        reindex_agp([filename])
        newagpfile = filename.replace(".agp", ".reindexed.agp")
        shutil.move(newagpfile, filename)
def write_AGP(self, filename, orientationguide=None):
    """
    Write one AGP component line per component in self.lines, plus gap lines
    where needed, to `filename`, then reindex the file in place.

    For each component, we have two overlaps: North and South.

    =======
       ||||             South
       ====(=================)  Current BAC
    North             ||||
                      ===============

    For the case that says "Non-terminal", the overlap will not be
    considered. North-South would suggest a '+' orientation, South-North
    would suggest a '-' orientation. In most cases, unless the overlap
    involves phase1 BAC, the selected range will be shown as the brackets
    above - exclude North overlap, and include South overlap (aka the
    "left-greedy" rule).

    `orientationguide` maps a component id to '+' or '-' and is consulted
    only when neither overlap defines an orientation (fallback is '+').
    """
    if orientationguide is None:
        orientationguide = {}

    fw = must_open(filename, "w")
    try:
        for aid, bb in groupby(self.lines, key=lambda x: x.aid):
            # Exactly two records per component are expected; any other
            # count fails this unpacking with ValueError.
            north, south = list(bb)
            aid = north.aid
            assert aid == south.aid

            aphase = north.aphase
            size = north.asize
            # Object start/end and part number are written as zeros here;
            # presumably recomputed by reindex() below - confirm.
            ar = [north.chr, 0, 0, 0]

            northline = southline = None
            northrange = southrange = None

            # Warn if adjacent components do not have valid overlaps
            if south.is_no_overlap:
                sys.stderr.write(str(south) + "\n")

            # NOTE(review): isTerminal is read as an attribute (no call) in
            # this function; confirm it is a property, since a bound method
            # would always be truthy.

            # Most gaps, except telomeres occur twice, so only do the "North"
            if north.is_gap:
                bar = ar + self.get_agp_gap(north.bid)
                northline = "\t".join(str(x) for x in bar)
            elif north.isTerminal:
                northrange = north.astart, north.astop

            if south.is_gap:
                if south.bid == "telomere":
                    bar = ar + self.get_agp_gap(south.bid)
                    southline = "\t".join(str(x) for x in bar)
            elif south.isTerminal:
                southrange = south.astart, south.astop
            else:
                bar = ar + self.get_agp_gap("fragment")
                southline = "\t".join(str(x) for x in bar)

            # Determine the orientation and clear range for the current BAC
            clr = [1, size]
            orientation = sorientation = None
            if northrange:
                start, stop = northrange
                lhang = start - 1    # bases left of the overlap
                rhang = size - stop  # bases right of the overlap

                orientation = '+' if lhang < rhang else '-'
                # Override left-greedy (also see below): keep the North
                # overlap when the North neighbor is phase 1 and the
                # current BAC is of a higher phase.
                if north.bphase == 1 and north.bphase < aphase:
                    if lhang < rhang:  # North overlap at 5`
                        clr[0] = start
                    else:
                        clr[1] = stop
                else:
                    if lhang < rhang:
                        clr[0] = stop + 1
                    else:
                        clr[1] = start - 1

            if southrange:
                start, stop = southrange
                lhang = start - 1
                rhang = size - stop

                sorientation = '+' if lhang > rhang else '-'
                # Override left-greedy (also see above): drop the South
                # overlap when the current BAC is phase 1 and the South
                # neighbor is of a higher phase.
                if aphase == 1 and aphase < south.bphase:
                    if lhang < rhang:  # South overlap at 5`
                        clr[0] = stop + 1
                    else:
                        clr[1] = start - 1
                else:
                    if lhang < rhang:
                        clr[0] = start
                    else:
                        clr[1] = stop

            if orientation:
                # A disagreement is only logged; the North orientation wins.
                # An explicit comparison is used instead of assert so the
                # message is still logged under `python -O`.
                if sorientation and orientation != sorientation:
                    logging.debug("Orientation conflicts:\n{0}\n{1}".format(north, south))
            elif sorientation:
                orientation = sorientation
            else:  # Both overlaps fail to define orientation
                orientation = orientationguide.get(aid, "+")

            component_type = "D" if aphase in (1, 2) else "F"
            bar = ar + [component_type, aid, clr[0], clr[1], orientation]
            cline = "\t".join(str(x) for x in bar)

            if northline:
                fw.write(northline + "\n")
            fw.write(cline + "\n")
            if southline:
                fw.write(southline + "\n")
    finally:
        # Close the handle even if a record fails part-way through.
        fw.close()

    reindex([filename, "--inplace"])
def estimategaps(args):
    """
    %prog estimategaps input.bed

    Estimate sizes of inter-scaffold gaps. The AGP file generated by path()
    command has unknown gap sizes with a generic number of Ns (often 100 Ns).
    The AGP file `input.chr.agp` is read, and the AGP with estimated gap
    sizes is written to `input.estimategaps.agp`.
    """
    # The docstring above doubles as the command's usage text.
    p = OptionParser(estimategaps.__doc__)
    p.add_option("--minsize", default=100, type="int",
                 help="Minimum gap size")
    p.add_option("--maxsize", default=500000, type="int",
                 help="Maximum gap size")
    p.add_option("--links", default=10, type="int",
                 help="Only use linkage grounds with matchings more than")
    p.set_verbose(help="Print details for each gap calculation")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    inputbed, = args
    # All companion files share the input's name minus its last extension.
    pf = inputbed.rsplit(".", 1)[0]
    agpfile = pf + ".chr.agp"
    bedfile = pf + ".lifted.bed"

    cc = Map(bedfile, scaffold_info=True)
    agp = AGP(agpfile)
    minsize, maxsize = opts.minsize, opts.maxsize
    links = opts.links
    verbose = opts.verbose

    outagpfile = pf + ".estimategaps.agp"
    fw = must_open(outagpfile, "w")
    try:
        for ob, components in agp.iter_object():
            components = list(components)
            s = Scaffold(ob, cc)
            mlg_counts = s.mlg_counts
            gaps = [x for x in components if x.is_gap]
            gapsizes = [None] * len(gaps)  # master, one slot per gap line

            for mlg, count in mlg_counts.items():
                # Skip linkage groups with fewer than `links` matchings.
                if count < links:
                    continue
                g = GapEstimator(cc, agp, ob, mlg)
                g.compute_all_gaps(minsize=minsize, maxsize=maxsize,
                                   verbose=verbose)

                # Merge evidence from this mlg into master: the first group
                # seeds each slot, later groups can only shrink it.
                assert len(g.gapsizes) == len(gaps)
                for i, (gs, gg) in enumerate(zip(gapsizes, g.gapsizes)):
                    if gs is None:
                        gapsizes[i] = gg
                    elif gg:
                        gapsizes[i] = min(gs, gg)

            # Single-argument form prints identically on Python 2 and 3.
            print(gapsizes)

            # Modify AGP: gap lines consume the merged estimates in order,
            # every line (gap or not) is written out.
            estimates = iter(gapsizes)
            for x in components:
                if x.is_gap:
                    # Gaps without a usable estimate fall back to minsize.
                    x.gap_length = next(estimates) or minsize
                    # A length of exactly 100 is typed 'U', anything else 'N'.
                    x.component_type = 'U' if x.gap_length == 100 else 'N'
                fw.write(str(x) + "\n")
    finally:
        # Close the handle even if estimation fails part-way through.
        fw.close()

    reindex([outagpfile, "--inplace"])