def __init__(
        self,
        path,
        lastcol=None,
        lastrow=None,
        colmeta=None,
        rowmeta=None,
        eps_noise=False,
):
    self.tbl = table(path)
    self.bak = self.tbl.copy()
    # strip / save metadata
    if lastrow is not None:
        self.tbl.head(lastrow, invert=True)
    if lastcol is not None:
        self.tbl.head(lastcol, invert=True, transposed=True)
    self.row = self.tbl.rowheads[:]
    self.col = self.tbl.colheads[:]
    if eps_noise:
        self.tbl.float()
        self.tbl.apply_entries(lambda x: x + c_eps * random.random())
    self.dat = self.tbl.table2array()
    # colmetas from file / table
    if colmeta is None:
        self.colmeta = None
        self.colmetaname = None
    else:
        self.colmeta = []
        self.colmetaname = []
        for x in colmeta:
            if os.path.exists(x):
                warn("Loading col metadata from file:", x)
                temp = col2dict(x, value=1)
                self.colmeta.append([temp.get(k, c_str_none) for k in self.col])
                self.colmetaname.append(path2name(x))
            else:
                temp = self.bak.rowdict(x)
                self.colmeta.append([temp.get(k, c_str_none) for k in self.col])
                self.colmetaname.append(x)
    # rowmetas from file / table
    if rowmeta is None:
        self.rowmeta = None
        self.rowmetaname = None
    else:
        self.rowmeta = []
        self.rowmetaname = []
        for x in rowmeta:
            if os.path.exists(x):
                warn("Loading row metadata from file:", x)
                temp = col2dict(x, value=1)
                self.rowmeta.append([temp.get(k, c_str_none) for k in self.row])
                self.rowmetaname.append(path2name(x))
            else:
                temp = self.bak.coldict(x)
                self.rowmeta.append([temp.get(k, c_str_none) for k in self.row])
                self.rowmetaname.append(x)
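# Usage sketch for the constructor above (the enclosing class is called
# DataFrame where it is used in main() below). The file and field names here
# are hypothetical; only the call pattern is taken from the code.
def _demo_dataframe():
    df = DataFrame(
        "abundance.pcl",     # hypothetical input table
        lastrow="STSite",    # hypothetical last metadata row to strip
        colmeta=["STSite"],  # reuse a stripped row as column metadata
        eps_noise=True,      # jitter entries to break ties before clustering
    )
    warn("loaded:", len(df.row), "rows x", len(df.col), "cols")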
def main():
    args = get_args()
    tbl = table(args.table)
    data = tbl.table2array(args.last_metadata).transpose()
    dist = distance_matrix(data, args.dissimilarity)
    if args.method == "manual":
        embedding, varexp, goodness = ordinate_cmdscale(dist)
    else:
        embedding, varexp, goodness = ordinate_sklearn(dist, method=args.method)
    fig = plt.figure()
    fig.set_size_inches(10, 6)
    ax = plt.subplot(111)
    xcoords = embedding[:, 0]
    ycoords = embedding[:, 1]
    shapes = ["o" for x in xcoords]
    if args.shapeby is not None:
        field, path = args.shapeby
        shapes = shapeize(tbl.row(field), path, ax)
    colors = ["black" for x in xcoords]
    if args.colorby is not None:
        field, path = args.colorby
        colors = colorize(tbl.row(field), path, ax)
    for x, y, c, s in zip(xcoords, ycoords, colors, shapes):
        ax.scatter(x, y, color=c, marker=s)
    ax.set_xlim(min(xcoords), max(xcoords))
    ax.set_ylim(min(ycoords), max(ycoords))
    mu.funcMargin(ax)
    title = path2name(args.table) if args.title is None else args.title
    ax.set_title("%s | Goodness of fit = %.3f" % (title, goodness))
    ax.set_xlabel("Dimension 1 (%.1f%%)" % (100 * varexp[0]))
    ax.set_ylabel("Dimension 2 (%.1f%%)" % (100 * varexp[1]))
    mu.funcSetTickParams(ax)
    for m in args.level_biplot:
        level_biplot(ax, embedding, tbl.row(m))
    for m in args.quant_biplot:
        quant_biplot(ax, embedding, tbl.row(m), m)
    # shrink the plot width by 30% to leave room for the legend on the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.7, box.height])
    ax.legend(
        scatterpoints=1,
        fontsize=8,
        loc="center left",
        bbox_to_anchor=(1, 0.5),
    )
    # plt.tight_layout()
    plt.savefig(args.outfile)
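# ordinate_cmdscale() is not shown in this file. Below is a minimal sketch of
# classical MDS (cmdscale) under the assumption that it follows the standard
# double-centering construction; the goodness-of-fit convention and all names
# here are assumptions, not the verified implementation.
import numpy as np

def cmdscale_sketch(dist):
    # dist: (n, n) symmetric dissimilarity matrix
    n = dist.shape[0]
    J = np.eye(n) - np.ones((n, n)) / n          # centering matrix
    B = -0.5 * J.dot(dist ** 2).dot(J)           # double-centered squared distances
    evals, evecs = np.linalg.eigh(B)             # eigendecomposition (ascending)
    order = np.argsort(evals)[::-1]
    evals, evecs = evals[order], evecs[:, order]
    keep = evals > 0                             # keep positive eigenvalues only
    coords = evecs[:, keep] * np.sqrt(evals[keep])
    varexp = evals[keep] / np.sum(evals[keep])   # variance explained per axis
    goodness = np.sum(varexp[:2])                # assumed: fraction captured in 2D
    return coords, varexp, goodness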
)
args = parser.parse_args()

# ---------------------------------------------------------------
# load all data
# ---------------------------------------------------------------

dictTableData = {}
# modified for faster lookup, 4/2/2015
dictFeatureIndex = {}
for iDex, strPath in enumerate(args.input):
    print >> sys.stderr, "Loading", iDex + 1, "of", len(args.input), ":", strPath
    aastrData = []
    strColhead = path2name(strPath)
    with open(strPath) as fh:
        for astrItems in reader(fh):
            aastrData.append([astrItems[args.key_col], astrItems[args.val_col]])
    if args.strip_comments:
        aastrData = [astrRow for astrRow in aastrData if astrRow[0][0] != "#"]
    if args.use_headers:
        strColhead = aastrData[0][1]
    if args.strip_headers:
        aastrData = aastrData[1:]
    if args.key_pattern:
        aastrData = [
            astrRow for astrRow in aastrData
            if re.search(args.key_pattern, astrRow[0])
        ]
try:
    import openpyxl as xl
except ImportError:
    die("This script requires the OPENPYXL module")

# argument parsing (python argparse)
parser = argparse.ArgumentParser()
parser.add_argument("xlsx", help="xlsx workbook to split into per-sheet TSV files")
args = parser.parse_args()

wb = xl.load_workbook(filename=args.xlsx)
for ws in wb:
    basename = path2name(args.xlsx)
    # sanitize the sheet title for use in a file name
    sheet = ws.title
    sheet = re.sub("[^A-Za-z0-9]+", "_", sheet)
    newname = "{}.{}.tsv".format(basename, sheet)
    fh = open(newname, "w")
    ww = csv.writer(fh, csv.excel_tab)
    for row in ws.iter_rows():
        row2 = []
        for cell in row:
            value = cell.value
            if value is None:
                continue
            try:
                value = value.encode("utf-8")
            except:
                # non-text cells (numbers, dates) pass through unencoded
                pass
            row2.append(value)
        ww.writerow(row2)
    fh.close()
say("Will load:", len(args.inputs), "gathered from command line") if args.file is not None: before = len(args.inputs) with open(args.file) as fh: for line in fh: args.inputs.append(line.strip()) after = len(args.inputs) say("Will load:", after - before, "additional files gathered from:", args.file) for iDex, strPath in enumerate(args.inputs): say(sys.stderr, "Loading", iDex + 1, "of", len(args.inputs), ":", strPath) aastrData = [] strColhead = path2name( strPath) if not args.use_full_names else os.path.split(strPath)[1] with open(strPath) as fh: for astrItems in reader(fh): if args.strip_comments and astrItems[0][0] == "#": continue aastrData.append( [astrItems[args.key_col], astrItems[args.val_col]]) if args.use_headers: strColhead = aastrData[0][1] if args.strip_headers: aastrData = aastrData[1:] if args.key_pattern: aastrData = [ astrRow for astrRow in aastrData if re.search(args.key_pattern, astrRow[0]) ]
def main():
    args = get_args()
    fig = plt.figure()
    dims = Dimensions()
    df = DataFrame(
        args.table,
        lastcol=args.lastcol,
        lastrow=args.lastrow,
        colmeta=args.colmeta,
        rowmeta=args.rowmeta,
        eps_noise=args.eps_noise,
    )
    # force labeling all features
    if args.force_labels:
        global c_max_lab
        c_max_lab = 1e6
    # dim overrides
    vscale = 1
    hscale = 1
    if args.colmeta is not None and args.metascale:
        dims.colmeta_r = len(args.colmeta)
    if args.rowmeta is not None and args.metascale:
        dims.rowmeta_c = len(args.rowmeta)
    if args.vscale:
        old = dims.heat_r
        new = int(len(df.row) / 2.0) + len(df.row) % 2
        new = max(new, 8)
        dims.heat_r = new
        vscale = new / float(old)
    if args.hscale:
        old = dims.heat_c
        new = int(len(df.col) / 2.0) + len(df.col) % 2
        new = max(new, 12)
        dims.heat_c = new
        hscale = new / float(old)
    if args.cbar_extent is not None:
        dims.cbar_extent = args.cbar_extent
    if not args.debug:
        if args.title is None:
            dims.title_r = 0
        # no tree axes if the last sort is on 1) a file or 2) metadata or 3) nothing
        if os.path.exists(args.colsort[-1]) \
                or re.search("none|names|mean|metadata", args.colsort[-1]):
            dims.coltree_r = 0
        if os.path.exists(args.rowsort[-1]) \
                or re.search("none|names|mean|metadata", args.rowsort[-1]):
            dims.rowtree_c = 0
        if args.colmeta is None:
            dims.colmeta_r = 0
        if args.rowmeta is None:
            dims.rowmeta_c = 0
        if len(df.col) > c_max_lab and not args.hscale:
            dims.colnames_r = 1
        if len(df.row) > c_max_lab and not args.vscale:
            dims.rownames_c = 1
    dims.update()
    # manual overrides
    for o in args.overrides:
        p, v = o.split(":")
        setattr(dims, p, int(v))
    dims.update()
    # define figure
    fig.set_size_inches(
        args.hstretch * dims.csize * args.grid_inches / dims.scale,
        args.vstretch * dims.rsize * args.grid_inches / dims.scale,
    )
    # setup axes
    axes = HeatmapAxes(dims)
    # cluster cols
    Z = None
    for metric in args.colsort:
        Z = df.colsort(metric, linkage=args.linkage)
    if Z is not None:
        sch.dendrogram(
            Z,
            ax=axes.coltree,
            above_threshold_color="0.75",
            color_threshold=0,
        )
    # cluster rows
    Z = None
    for metric in args.rowsort:
        Z = df.rowsort(metric, linkage=args.linkage)
    if Z is not None:
        sch.dendrogram(
            Z,
            ax=axes.rowtree,
            orientation="left",
            above_threshold_color="0.75",
            color_threshold=0,
        )
    # apply transform
    df.transform(args.transform)
    # check limits (acheck: see the sketch after this function)
    poormin = False
    poormax = False
    vmin, vmax = (None, None) if args.limits is None else args.limits
    dmin, dmax = np.min(df.dat), np.max(df.dat)
    if vmin is None:
        vmin = dmin
    elif dmin < vmin:
        poormin = True
        n, p = acheck(df.dat, lambda x: x < vmin)
        warn("{} values ({:.2f}%) < vmin ({}), extreme: {}".format(
            n, 100 * p, vmin, dmin))
    if vmax is None:
        vmax = dmax
    elif dmax > vmax:
        poormax = True
        n, p = acheck(df.dat, lambda x: x > vmax)
        warn("{} values ({:.2f}%) > vmax ({}), extreme: {}".format(
            n, 100 * p, vmax, dmax))
    # add heatmap
    axes.heatmap.set_xlim(0, len(df.col))
    axes.heatmap.set_ylim(0, len(df.row))
    if args.engine == "imshow":
        # imshow is similar to pcolorfast, but better centered
        nr = len(df.row)
        nc = len(df.col)
        kwargs = {
            "interpolation": "none",
            "origin": "lower",
            "aspect": "auto",
            "extent": [0, nc, 0, nr],
        }
        pc = axes.heatmap.imshow(df.dat, cmap=args.cmap,
                                 vmin=vmin, vmax=vmax, **kwargs)
    elif args.engine == "pcolorfast":
        # probably no reason to use this
        pc = axes.heatmap.pcolorfast(df.dat, cmap=args.cmap, vmin=vmin, vmax=vmax)
    elif args.engine == "pcolormesh":
        # use this if you want the individual heatmap cells to be editable shapes
        pc = axes.heatmap.pcolormesh(df.dat, cmap=args.cmap, vmin=vmin, vmax=vmax)
    # add cmap bar
    fig.colorbar(pc, cax=axes.cbar)
    axes.cbar.set_ylabel(
        args.units if args.transform == "none"
        else "{}( {} )".format(args.transform, args.units),
        size=c_font3,
    )
    set_cbar_ticks(axes.cbar, pc.get_clim(), poormin=poormin, poormax=poormax)
    # add column metadata
    if df.colmeta is not None:
        colmeta_cmaps = axes.colmetaplot(df, args.colmeta_colors, args.max_levels)
    # add row metadata
    if df.rowmeta is not None:
        rowmeta_cmaps = axes.rowmetaplot(df, args.rowmeta_colors, args.max_levels)
    # add column transition lines if ending on a metasort
    if "metadata" in args.colsort[-1]:
        args.colbreaks = args.colsort[-1]
    if args.colbreaks is not None:
        lastsort = args.colbreaks
        index = 0 if ":" not in lastsort else (int(lastsort.split(":")[1]) - 1)
        pos = []
        for i, value in enumerate(df.colmeta[index]):
            if i > 0 and df.colmeta[index][i - 1] != value:
                pos.append(i)
        for i in pos:
            mu.vline(axes.colmeta, i, color="black")
            mu.vline(axes.heatmap, i, color=args.break_color)
    # add row transition lines if ending on a metasort
    if "metadata" in args.rowsort[-1]:
        args.rowbreaks = args.rowsort[-1]
    if args.rowbreaks is not None:
        lastsort = args.rowbreaks
        index = 0 if ":" not in lastsort else (int(lastsort.split(":")[1]) - 1)
        pos = []
        for i, value in enumerate(df.rowmeta[index]):
            if i > 0 and df.rowmeta[index][i - 1] != value:
                pos.append(i)
        for i in pos:
            mu.hline(axes.rowmeta, i, color="black")
            mu.hline(axes.heatmap, i, color=args.break_color)
    # add generic grids
    if "x" in args.grid:
        for i in range(1, len(df.col)):
            mu.vline(axes.heatmap, i, color=args.break_color)
    if "y" in args.grid:
        for i in range(1, len(df.row)):
            mu.hline(axes.heatmap, i, color=args.break_color)
    # title
    if args.title is not None:
        axes.set_title(args.title)
    # add dots
    dots_added = []
    if args.dots is not None:
        for p in args.dots:
            dots_added.append(add_dots(axes, df, p))
    # legend
    L = mu.Legendizer(axes.legend, vscale=0.7 / vscale)
    # col sort legend
    L.subhead("Col sort")
    for m in args.colsort:
        if "metadata" in m:
            i = 0
            if ":" in m:
                i = int(m.split(":")[1]) - 1
            m = "metadata:" + df.colmetaname[i]
        L.element("_", color="0.75", label=m)
    # row sort legend
    L.subhead("Row sort")
    for m in args.rowsort:
        if "metadata" in m:
            i = 0
            if ":" in m:
                i = int(m.split(":")[1]) - 1
            m = "metadata:" + df.rowmetaname[i]
        L.element("_", color="0.75", label=m)
    # col metadata legend
    levelorder = {c_str_other: 1, c_str_none: 2}
    if df.colmeta is not None:
        for n, c in zip(df.colmetaname[::-1], colmeta_cmaps[::-1]):
            L.subhead(n)
            for l in sorted(c, key=lambda x: [levelorder.get(x, 0), x]):
                color = c[l]
                L.element("s", color=color, label=l)
    # row metadata legend
    if df.rowmeta is not None:
        for n, c in zip(df.rowmetaname[::-1], rowmeta_cmaps[::-1]):
            L.subhead(n)
            for l in sorted(c, key=lambda x: [levelorder.get(x, 0), x]):
                color = c[l]
                L.element("s", color=color, label=l)
    # dots legend
    if len(dots_added) > 0:
        L.subhead("Dots")
        for p, kwargs in dots_added:
            marker = kwargs.get("marker", "o")
            kwargs = {k: v for k, v in kwargs.items()
                      if k not in ["s", "marker"]}
            L.element(marker, label=path2name(p), **kwargs)
    # finalize
    L.draw()
    # cleanup
    if args.override_colnames != "-":
        axes.collabel(df, args.collabel, scaled=args.hscale,
                      path=args.override_colnames)
    if args.override_rownames != "-":
        axes.rowlabel(df, args.rowlabel, scaled=args.vscale,
                      path=args.override_rownames)
    if not args.debug:
        axes.clean()
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    plt.savefig(args.output, bbox_inches="tight")
    # logging
    if args.dump_colsort_order:
        with open(args.output + ".colsort", "w") as fh:
            for item in df.col:
                print >> fh, item
    if args.dump_rowsort_order:
        with open(args.output + ".rowsort", "w") as fh:
            for item in df.row:
                print >> fh, item
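# acheck() is used in main() to report out-of-limit values but is not shown
# here. A minimal sketch of its assumed semantics: the count and fraction of
# array entries satisfying a predicate. The name and return convention are
# guesses from the call sites above.
import numpy as np

def acheck_sketch(a, func):
    hits = np.vectorize(func)(a)   # elementwise predicate over the array
    n = int(np.sum(hits))
    return n, n / float(a.size)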
#! /usr/bin/env python

import os, sys, re, glob, argparse

from zopy.table2 import table
from zopy.dictation import col2dict
from zopy.utils import path2name

# usage: <mapping file> <input table> <output table>
dictMap = col2dict(sys.argv[1], key=0, value=1)
tableData = table(sys.argv[2])
tableData.apply_colheads(lambda x: path2name(x))     # strip directory + extension
tableData.apply_colheads(lambda x: x.split(".")[0])  # keep text before first "."
tableData.apply_colheads(lambda x: dictMap[x])       # apply the mapping
tableData.dump(sys.argv[3])
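# Worked example tracing one header through the three apply_colheads steps
# above. The path is hypothetical, and path2name's exact behavior is assumed
# from its use elsewhere in this repo (drop directory and last extension).
def _trace_header_example():
    h = "/data/runs/sampleA.cp.txt"
    h = path2name(h)           # assumed -> "sampleA.cp"
    h = h.split(".")[0]        # -> "sampleA"
    return dictMap.get(h, h)   # mapped name (the script itself requires a hit)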
die("This script requires the OPENPYXL module") # argument parsing (python argparse) parser = argparse.ArgumentParser() parser.add_argument("tsv_files", nargs="+", help="") parser.add_argument("--outfile", default=None) args = parser.parse_args() wb = xl.Workbook() sheets = [] for i, p in enumerate(args.tsv_files): if i == 0: sheets.append(wb.active) else: sheets.append(wb.create_sheet()) sheets[-1].title = path2name(p) for p, ws in zip(args.tsv_files, sheets): for i, row in enumerate(iter_rows(p)): for j, val in enumerate(row): try: val = float(val) except: pass ws.cell(row=i + 1, column=j + 1, value=val) if args.outfile is not None: outfile = args.outfile elif len(args.tsv_files) == 1: outfile = "{}.xlsx".format(path2name(args.tsv_files[0])) else:
parser.add_argument('-i', '--input', nargs="+",
                    help='One or more MetaPhlAn clade profiles')
parser.add_argument('-o', '--output',
                    help='Marker PCL file')
parser.add_argument('-e', '--headers', action="store_true",
                    help='Files have headers')
parser.add_argument('-g', '--grep', default=None,
                    help='grep on clades')
parser.add_argument('-x', '--extension_groups', default=1, type=int,
                    help='.txt is 1, .cp.txt is 2, etc.')
args = parser.parse_args()

# load everything as a nested dict: [sample][marker] = value
nesteddictData = {}
for i, path in enumerate(args.input):
    print >> sys.stderr, "loading", i + 1, "of", len(args.input)
    name = path2name(path, args.extension_groups)
    nesteddictData[name] = funcLoadCladeProfile(path, grep=args.grep,
                                                headers=args.headers)
# convert to a table, substituting 0 for missing values
tableData = nesteddict2table(nesteddictData, empty=0)
# transpose to get markers on the rows, unfloat, save as pcl
tableData.transpose()
tableData.unfloat()
tableData.dump(args.output)
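# nesteddict2table() comes from zopy and is not shown here. A sketch of its
# assumed behavior as plain rows (outer keys -> rows, inner keys -> columns,
# `empty` filling missing cells); this is a guess from the call above, and the
# real function returns a table object rather than nested lists.
def nesteddict2table_sketch(dd, empty=0):
    cols = sorted(set(k for inner in dd.values() for k in inner))
    rows = [["#"] + cols]
    for outer in sorted(dd):
        rows.append([outer] + [dd[outer].get(c, empty) for c in cols])
    return rows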
t.apply_entries(lambda x: float(x) if x is not None else None)
# derived features
aColors = [plt.cm.Dark2(i / float(len(t.colheads)))
           for i, colhead in enumerate(t.colheads)]
aaData = [row for rowhead, row in t.iter_rows()]
logmin, logmax = args.logmin, args.logmax
# make plot
fig = plt.figure()
axes = plt.subplot(111)
stepplot(axes, aaData, colors=aColors)
# configure
axes.set_yscale("log")
axes.set_ylim(logmin, logmax)
axes.xaxis.set_ticklabels(t.colheads, rotation=35,
                          rotation_mode="anchor", ha="right")
axes.set_title(utils.path2name(args.input))
axes.set_ylabel("Relative abundance")
mu.funcGrid(axes, xaxis=False, color="gray", linestyle=":")
mu.funcSetTickParams(axes)
# done
plt.tight_layout()
plt.savefig(args.output)
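# stepplot() is a local helper not shown in this excerpt. A minimal sketch of
# its assumed behavior (one step trace per data row, drawn across the column
# positions); a guess from the call above, not the verified implementation.
def stepplot_sketch(ax, rows, colors=None):
    colors = colors if colors is not None else ["black"] * len(rows)
    for row, color in zip(rows, colors):
        ax.step(range(len(row)), row, where="mid", color=color)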