import re
import sys
import argparse
import traceback
import math
from decimal import Decimal
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping


class ComputeListGroup(Group):
    """Buffers one column's values per group and applies a user-supplied
    expression to the whole list when the group completes."""

    def __init__(self, tup):
        super(ComputeListGroup, self).__init__(tup)
        self.values = []

    def add(self, chunks):
        # Collect the raw (string) column value; evaluation is deferred
        # until done() so the expression sees the complete list.
        self.values.append(chunks[args.column])

    def done(self):
        # Emit the group key followed by the expression's result.
        args.outfile.write(self.tup + [args.expression(self.values)])


if __name__ == "__main__":
    pp = ParameterParser('User defined computation on a column',
                         columns=1, append=False, labels=[None])
    pp.parser.add_argument('-e', '--expression',
                           help='equation to call. use l[i] to indicate row i of the list')
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_list_compute']
    args = pp.getArgs(args)
    # NOTE(review): eval of a user-supplied expression. Acceptable for a
    # local CLI tool; never expose this to untrusted input.
    args.expression = eval('lambda l: ' + args.expression)
    run_grouping(args.infile, ComputeListGroup, args.group, args.ordered)
    # NOTE(review): this chunk begins inside a conversion helper whose `def`
    # line is outside the visible range.
    return ToUnixTime(datetime.strptime(dt, args.format))

def cToDateTime(dt):
    # Render a numeric unix timestamp using the configured strftime format.
    return ToDateTime(float(dt)).strftime(args.format)

def cTimeOfDay(dt):
    # Render only the time-of-day portion of a numeric unix timestamp.
    return TimeOfDay(ToDateTime(float(dt))).strftime(args.format)

if __name__ == "__main__":
    pp = ParameterParser('Convert timestamps', columns = '*', group = False, append = False, ordered = False)
    pp.parser.add_argument('-i', '--in-format')
    pp.parser.add_argument('-o', '--out-format')
    args = pp.parseArgs()
    args.append = True
    args = pp.getArgs(args)
    # TimestampGroup is defined outside this chunk.
    run_grouping(args.infile, TimestampGroup, [], False)

# Dead code retained from a pre-ParameterParser version of this tool:
#    args.function = getattr(sys.modules[__name__], 'c'+args.function)
#    if not args.format:
#        if args.function == cTimeOfDay:
#            args.format = '%H:%M:%S.%f'
#        else:
#            args.format = '%Y-%m-%d_%H:%M:%S.%f'
#
#    jdelim = args.delimiter if args.delimiter != None else ' '
#    for line in args.infile:
#        val = line.rstrip().split(args.delimiter)[args.column]
#        res = args.function(val)
#        if args.append:
#            args.outfile.write('%s%s' % (line.rstrip(), jdelim))
#        args.outfile.write(str(res)+'\n')
class PadGroup(Group):
    """Echoes every input row and, when a group ends, appends padding rows
    for each expected element that never appeared in the group."""

    def __init__(self, tup):
        super(PadGroup, self).__init__(tup)
        self.present = set()

    def add(self, chunks):
        key = tuple(chunks[i] for i in args.columns)
        self.present.add(key)
        args.outfile.write(chunks)

    def done(self):
        # Emit one padded row per expected-but-missing element.
        for element in args.elements:
            if element not in self.present:
                args.outfile.write(self.tup + list(element) + args.pad)


if __name__ == "__main__":
    pp = ParameterParser('Generate additional rows to pad input',
                         columns='*', append=False, labels=False, ordered=False)
    pp.parser.add_argument('-e', '--elements',
                           help='File containing list elements, one per line.')
    pp.parser.add_argument('-p', '--pad', nargs='+', default=['0'])
    args = pp.parseArgs()
    args.append = True
    args = pp.getArgs(args)
    # Load the universe of expected elements from the side file.
    expected = set()
    with FileReader(args.elements, args) as f:
        for chunks in f:
            expected.add(tuple(chunks))
    args.elements = expected
    run_grouping(args.infile, PadGroup, args.group, ordered=False)
        # (chunk starts inside __init__ of the stack-distance group class)
        # items maps value-tuple -> sequence number of its last occurrence.
        self.items = defaultdict(int)
        self.count = 0

    def add(self, chunks):
        self.count += 1
        val = tuple(chunks[c] for c in args.columns)
        if val in self.items:
            val_item = self.items[val]
            # Stack distance: number of items whose last occurrence is more
            # recent than this item's previous occurrence (Python 2 itervalues).
            distance = sum(1 for item in self.items.itervalues() if item > val_item) # Find all items with indices larger than the last occurance of this item
        else:
            # First occurrence has no defined distance.
            distance = -1
        self.items[val] = self.count
        if args.append:
            args.outfile.write(chunks + [distance])
        else:
            args.outfile.write(self.tup + [distance])

    def done(self):
        pass

if __name__ == "__main__":
    pp = ParameterParser('Compute the stack distance', columns = '*', labels = [None])
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = ['_'.join(args.columns_names) + '_stack_distance']
    args = pp.getArgs(args)
    # StackGroup's class statement is outside this chunk.
    run_grouping(args.infile, StackGroup, args.group, args.ordered)
    def done(self):
        # Flush remaining buffered state (method of a DBSCAN group class
        # whose class statement is outside this chunk).
        self.empty()

if __name__ == "__main__":
    pp = ParameterParser('Cluster input using the DBSCAN algorithm', columns = 0, labels = [None], append = False)
    pp.parser.add_argument('--online', action='store_true', default=False, help='changes meaning of range parameter to a monotonically increasing position value')
    pp.parser.add_argument('-f', '--first', default='0', help='first key column')
    pp.parser.add_argument('-s', '--second', default='1', help='second key column (offline only)')
    pp.parser.add_argument('-r', '--range', default='2', help='column with distance')
    pp.parser.add_argument('-e', '--epsilon', type=float, default=0.5)
    pp.parser.add_argument('-m', '--min_samples', type=int, default=5)
    args = pp.parseArgs()
    if args.online:
        args.append = True
    if not any(args.labels):
        if args.online:
            args.labels = ['label']
        else:
            args.labels = [args.infile.header.name(args.first), 'label']
    args = pp.getArgs(args)
    # Resolve user-supplied column names/indices to numeric indices.
    args.first = args.infile.header.index(args.first)
    args.second = args.infile.header.index(args.second)
    args.range = args.infile.header.index(args.range)
    if args.online:
        # Running cluster-id counter shared across online groups.
        args.label = 0
        run_grouping(args.infile, OnlineDBSCANGroup, args.group, args.ordered)
    else:
        run_grouping(args.infile, OfflineDBSCANGroup, args.group, args.ordered)
import os
import sys
import argparse
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping


class AccumulateGroup(Group):
    """Maintains a running sum of one numeric column within each group and
    writes the cumulative total on every row."""

    def __init__(self, tup):
        super(AccumulateGroup, self).__init__(tup)
        self.total = 0

    def add(self, chunks):
        self.total += findNumber(chunks[args.column])
        # Append the running total to the row, or pair it with the group key.
        row = chunks + [self.total] if args.append else self.tup + [self.total]
        args.outfile.write(row)

    def done(self):
        # Nothing to flush; totals are emitted incrementally in add().
        pass


if __name__ == "__main__":
    pp = ParameterParser('Accumulate the values of a column(s)',
                         columns=1, labels=[None])
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_accumulate']
    args = pp.getArgs(args)
    run_grouping(args.infile, AccumulateGroup, args.group, args.ordered)
# if args.colourmap is not None: # args.ax.set_color_cycle(plt.get_cmap(args.colourmap[0])(np.linspace(0,1,int(args.colourmap[1])))) # Process sources in order for i,infile in enumerate(args.infiles): s = Source(infile) s.mapping = {v : infile.header.index(args.mapping[v][i]) for v in args.mapping if args.mapping[v][i] is not None} # s.geom = args.geom[i] # s.label = args.sourcelabels[i] # s.colour = args.colour[i] # s.shape = args.shape[i] # s.fill = args.fill[i] # s.alpha = args.alpha[i] # s.size = args.size[i] args.current = s run_grouping(infile, PlotGroup, args.group, False) infile.close() if args.xscale: args.ax.set_xscale(args.xscale) if args.xmajorticks: args.ax.set_xticks([fmt(x, args.xtype, args.xformat) for x in args.xmajorticks]) if args.xmajorticklabels: args.ax.set_xticklabels(args.xmajorticklabels) if args.xminorticks: args.ax.set_xticks([fmt(x, args.xtype, args.xformat) for x in args.xminorticks], minor = True) if args.xtickformat: args.ax.xaxis.set_major_formatter(tick_fmt(args.xtype, args.xtickformat)) if args.yscale: args.ax.set_yscale(args.yscale, nonposy='clip')
class KMinGroup(Group):
    """Tracks the k smallest values of a column with a bounded heap.

    Values are stored negated so heapq's min-heap evicts the largest of the
    current candidates whenever the heap grows beyond k entries."""

    def __init__(self, tup):
        super(KMinGroup, self).__init__(tup)
        self.heap = []

    def add(self, chunks):
        heappush(self.heap, -findNumber(chunks[args.column]))
        if len(self.heap) > args.k:
            # Evict the largest of the k+1 candidates.
            heappop(self.heap)

    def done(self):
        # Negated values sorted descending == real values ascending.
        for neg in sorted(self.heap, reverse=True):
            args.outfile.write(self.tup + [-neg])


if __name__ == "__main__":
    pp = ParameterParser('Compute minimum of column', columns=1, labels=[None])
    pp.parser.add_argument('-k', '--k', type=int, default=1,
                           help='find the k minimum values')
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_min']
    if args.append:
        args.labels = []
    args = pp.getArgs(args)
    # MinGroup (the k == 1 fast path) is defined outside this chunk.
    cls = KMinGroup if args.k > 1 else MinGroup
    run_grouping(args.infile, cls, args.group, args.ordered)
            # (chunk starts inside done(); the enclosing `if args.replacement:`
            #  and earlier buffering are outside the visible range)
            for val in (random.choice(self.row) for n in range(args.number)):
                if args.append:
                    # Emit a random full row carrying this sampled value.
                    args.outfile.write(random.choice(self.rows[val]))
                else:
                    args.outfile.write(self.tup + [val])
        else:
            # Sampling without replacement.
            for val in self.sample(self.row, args.number):
                if args.append:
                    # Remove the emitted row so it cannot be drawn again.
                    i = random.choice(range(len(self.rows[val])))
                    args.outfile.write(self.rows[val][i])
                    del self.rows[val][i]
                else:
                    args.outfile.write(self.tup + [val])

    def sample(self, rows, number):
        # random.sample raises if number exceeds the population size, so
        # fall back to returning everything.
        if number >= len(rows):
            return rows
        else:
            return random.sample(rows, number)

if __name__ == "__main__":
    pp = ParameterParser('Sample rows from file', columns = 1)
    pp.parser.add_argument('-r', '--replacement', action='store_true', default=False, help='with replacement')
    pp.parser.add_argument('-s', '--seed', type=int, default=12345)
    pp.parser.add_argument('-n', '--number', type=int, default=10, help='number of samples')
    args = pp.parseArgs()
    args = pp.getArgs(args)
    # Fixed default seed keeps runs reproducible.
    random.seed(args.seed)
    run_grouping(args.infile, SampleGroup, args.group, args.ordered)
class EntropyGroup(Group):
    """Computes the entropy of one numeric column per group."""

    def __init__(self, tup):
        super(EntropyGroup, self).__init__(tup)
        self.vals = []

    def add(self, chunks):
        self.vals.append(float(findNumber(chunks[args.column])))

    def done(self):
        # Heavy imports are deferred until a group actually completes.
        import numpy as np
        from scipy.stats import entropy
        dist = np.array(self.vals) / np.sum(self.vals)
        if args.pad is not None and args.pad > len(dist):
            # Pad the distribution out to the number of potential values.
            dist = np.append(dist, [0.0] * (args.pad - len(dist)))
        e = entropy(dist, base=args.base)
        args.outfile.write(self.tup + [e])


if __name__ == "__main__":
    pp = ParameterParser('Entropy of a column', columns=1, append=False, labels=[None])
    pp.parser.add_argument('-p', '--pad', type=int, default=None,
                           help='pad to number of potential values')
    pp.parser.add_argument('--base', type=float, default=None,
                           help='entropy base (default is e)')
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_entropy']
    args = pp.getArgs(args)
    run_grouping(args.infile, EntropyGroup, args.group, args.ordered)
import argparse
from toollib.files import ParameterParser
from toollib.group import Group,run_grouping


class SetGroup(Group):
    """Emits exactly one row per group: the first one seen."""

    def __init__(self, tup):
        super(SetGroup, self).__init__(tup)

    def add(self, chunks):
        # First row of the group: write it, then swap in a no-op so every
        # subsequent row of the same group is ignored.
        args.outfile.write(chunks if args.append else self.tup)
        self.add = self.noop

    def noop(self, chunks):
        pass

    def done(self):
        pass


if __name__ == "__main__":
    pp = ParameterParser('Compute the set of strings from a column in files. Maintains first appearance order.', columns='*', ordered=False)
    args = pp.parseArgs()
    args = pp.getArgs(args)
    if not args.append and args.infile.hasHeader:
        # Output keeps only the selected columns, so emit a matching header.
        args.outfile.header.addCols(args.columns_names)
    # Note: grouping is driven by the selected columns themselves.
    run_grouping(args.infile, SetGroup, args.columns, False)
def __init__(self, tup): super(OccurGroup, self).__init__(tup) if 'first' in args.order: self.add = self.addFirst else: self.add = self.addNothing self.last = None def addFirst(self, chunks): args.outfile.write(chunks) if args.duplicate: self.last = chunks self.add = self.addNothing def addNothing(self, chunks): self.last = chunks def done(self): if self.last is not None and 'last' in args.order: args.outfile.write(self.last) if __name__ == "__main__": pp = ParameterParser('Output the first/last occurance of a group', columns = False, append = False, ordered = True) pp.parser.add_argument('-o', '--order', nargs='+', default=['first'], choices=['first', 'last']) pp.parser.add_argument('-d', '--duplicate', action='store_true', default=False, help='if order is first and last and there is only 1 group member, print same line twice') args = pp.parseArgs() args.append = True args = pp.getArgs(args) run_grouping(args.infile, OccurGroup, args.group, args.ordered)
    # (chunk starts inside a median helper; `r` maps value -> count and `p` is
    #  the percentile fraction -- their definitions are outside this view)
    position = (sum(r.itervalues()) + 1) * p  # 1-based rank, possibly fractional
    ir = int(position)            # integral part of the rank
    fr = Decimal(position - ir)   # fractional part, used for interpolation
    count = 0
    prev = None
    for key in sorted(r.iterkeys()):
        if count >= ir:
            break
        count += r[key]
        prev = key
    if prev is None:
        return key
    if fr == 0: # Whole value
        return prev
    elif count == ir: # Falls on the border between keys
        # NOTE(review): these interpolation weights look inverted -- linear
        # interpolation is conventionally prev*(1-fr) + key*fr. Confirm intent.
        return prev * fr + key * (1 - fr)
    else: # Both median - 1 and median + 1 are same key
        return prev

if __name__ == "__main__":
    pp = ParameterParser('Compute median of a column', columns = '*', append = False, labels = [None])
    pp.parser.add_argument('-b', '--bin', default=None)
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [cn + '_median' for cn in args.columns_names]
    args = pp.getArgs(args)
    args.bin = args.infile.header.index(args.bin)
    # MedianGroup is defined outside this chunk.
    run_grouping(args.infile, MedianGroup, args.group, args.ordered)
self.maxes = [[] for c in args.columns] def add(self, chunks): for i,c in enumerate(args.columns): heappush(self.maxes[i], findNumber(chunks[c])) if len(self.maxes[i]) > args.k: heappop(self.maxes[i]) def done(self): for i,m in enumerate(self.maxes): self.maxes[i] = reversed(sorted(m)) for k in range(args.k): args.outfile.write(self.tup + [m[k] for m in self.maxes] + [ k+1 ]) if __name__ == "__main__": pp = ParameterParser('Compute maximum of columns', columns = '*', labels = [None]) pp.parser.add_argument('-k', '--k', type = int, default = 1, help = 'find the k maximum values') args = pp.parseArgs() if not any(args.labels): args.labels = [cn + '_max' for cn in args.columns_names] if args.append: args.labels = [] if args.k > 1: args.labels.append('k') args = pp.getArgs(args) if args.k > 1: run_grouping(args.infile, KMaxGroup, args.group, args.ordered) else: run_grouping(args.infile, MaxGroup, args.group, args.ordered)
        # (chunk starts inside add(); `val` is parsed earlier, outside this view)
        # First row of a group uses args.beginning as the baseline.
        diff = val - self.last if self.last != None else val - args.beginning
        if self.last != None or args.leading:
            if args.append:
                args.outfile.write(chunks + [str(diff)])
            else:
                args.outfile.write(self.tup + [str(diff)])
        # NOTE(review): args.ending is overwritten with the current value on
        # every row, so the trailing diff emitted in done() is always 0 --
        # confirm whether args.ending was meant to stay user-supplied.
        args.ending = self.last = val
        self.chunks = chunks

    def done(self):
        if args.ending and args.trailing:
            if args.append:
                args.outfile.write(self.chunks + [str(args.ending - self.last)])
            else:
                args.outfile.write(self.tup + [str(args.ending - self.last)])

if __name__ == "__main__":
    pp = ParameterParser('Compute the difference between subsequent elements in a column', columns = 1, labels = [None])
    pp.parser.add_argument('--leading', action='store_true', default=False)
    pp.parser.add_argument('--trailing', action='store_true', default=False)
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_interval']
    args = pp.getArgs(args)
    args.beginning = None
    args.ending = None
    run_grouping(args.infile, IntervalGroup, args.group, args.ordered)
        # (chunk starts inside add(); `past` and `future` are buffers of
        #  numeric column values -- their initialization is outside this view)
        current = self.future.popleft()
        # Distances to the retained past values and all buffered future values.
        nearest = [abs(x - current) for x in self.past] + [abs(x - current) for x in self.future]
        nearest = sorted(nearest)[:args.k]
        args.outfile.write(self.tup + [current] + nearest)
        self.past.append(current)
        # Retain at most k past values.
        while len(self.past) > args.k:
            self.past.popleft()

    def done(self):
        # Drain the lookahead buffer, emitting neighbours for each remaining value.
        while len(self.future) > 0:
            current = self.future.popleft()
            nearest = [abs(x - current) for x in self.past] + [abs(x - current) for x in self.future]
            nearest = sorted(nearest)[:args.k]
            args.outfile.write(self.tup + [current] + nearest)
            self.past.append(current)
        self.past.clear()

if __name__ == "__main__":
    pp = ParameterParser('Compute the k-nearest values', columns = 1, labels = [None], append = False)
    pp.parser.add_argument('-k', '--k', type=int, default=1)
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name] + ['{0}_k{1}_nearest'.format(args.column_name, k+1) for k in range(args.k)]
    args = pp.getArgs(args)
    run_grouping(args.infile, KNearGroup, args.group, args.ordered)
from decimal import Decimal
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping
from math import sqrt


class SkewGroup(Group):
    """Computes the skewness of one numeric column per group."""

    def __init__(self, tup):
        super(SkewGroup, self).__init__(tup)
        self.vals = []

    def add(self, chunks):
        # BUG FIX: the original parsed the value into a local but never
        # stored it, leaving self.vals permanently empty.
        self.vals.append(float(findNumber(chunks[args.column])))

    def done(self):
        # BUG FIX: the original referenced an undefined local `vals`,
        # contained a stray no-op expression, and called chisquare()
        # (copied from the distribution-test tool) instead of skew().
        vals = self.vals
        if args.pad is not None and args.pad > len(vals):
            # Pad with zeros up to the number of potential values.
            vals = vals + [0.0] * (args.pad - len(vals))
        from scipy.stats import skew
        args.outfile.write(self.tup + [skew(vals)])


if __name__ == "__main__":
    pp = ParameterParser('Skew of the distribution', columns=1, append=False, labels=[None])
    pp.parser.add_argument('-p', '--pad', type=int, default=None,
                           help='pad to number of potential values')
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_skew']
    args = pp.getArgs(args)
    # BUG FIX: dropped the original args.expectation lookup -- this tool
    # defines no --expectation option, so it raised AttributeError at startup.
    run_grouping(args.infile, SkewGroup, args.group, args.ordered)
    def done(self):
        # Method of a distribution-test group class whose class statement is
        # outside this chunk; self.vals / self.expect are collected in add().
        import numpy as np
        vals = np.array(self.vals)
        expect = np.array(self.expect)
        # Normalize expected counts to ratios.
        expect = expect / np.sum(expect)
        if args.invert:
            # Invert: smaller expected values become proportionally more likely.
            expect = (np.sum(expect) / expect) / np.sum(np.sum(expect) / expect)
        else:
            # Already normalized above; dividing by a sum of 1 is a no-op.
            expect = expect / np.sum(expect)
        # Scale expected ratios to the observed total.
        expect = expect * np.sum(vals)
        if args.pad is not None and args.pad > len(vals):
            vals = np.append(vals, [0.0] * (args.pad - len(vals)))
            expect = np.append(expect, [0.0] * (args.pad - len(expect)))
        from scipy.stats import chisquare
        # chisquare returns (statistic, pvalue); both are written out.
        args.outfile.write(self.tup + list(chisquare(vals) if args.expectation is None else chisquare(vals, f_exp = expect)))

if __name__ == "__main__":
    pp = ParameterParser('Entropy of a column', columns = 1, append = False, labels = [None])
    pp.parser.add_argument('-d', '--dist', choices = ['chisquare'], default='chisquare', help='distribution test to run')
    pp.parser.add_argument('-e', '--expectation', default=None, help='column containing expected distribution ratio')
    pp.parser.add_argument('-i', '--invert', action='store_true', default=False, help='invert the expected values (smaller values proportionally more likely)')
    pp.parser.add_argument('-p', '--pad', type=int, default=None, help='pad to number of potential values')
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_disttest']
    args = pp.getArgs(args)
    args.expectation = args.infile.header.index(args.expectation)
    run_grouping(args.infile, DistGroup, args.group, args.ordered)
super(ConvolveGroup, self).__init__(tup) self.vals = [] self.add = self._addall if args.append else self._add def _add(self, chunks): self.vals.append(findNumber(chunks[args.column])) def _addall(self, chunks): self.vals.append(chunks) def done(self): if args.append: for i,v in enumerate(np_convolve(args.function, [findNumber(val[args.column]) for val in self.vals], mode=args.mode)): if args.mode == args.outfile.write(self.vals[i] + [v]) else: for v in np_convolve(args.function, self.vals, mode=args.mode): args.outfile.write(self.tup + [v]) if __name__ == "__main__": pp = ParameterParser('Convolve on a column', columns = 1, labels = [None], append = False) pp.parser.add_argument('-m', '--mode', default='full', choices=['full', 'same', 'valid']) pp.parser.add_argument('-f', '--function', default=[Decimal('0.333'), Decimal('0.334'), Decimal('0.333')], type=Decimal, nargs='+', help='append result to columns') args = pp.parseArgs() if not any(args.labels): args.labels = [args.column_name + '_convolve'] args = pp.getArgs(args) args.append = False run_grouping(args.infile, ConvolveGroup, args.group, args.ordered)
    # (chunk from the __main__ section of a resample tool; parser creation and
    #  earlier options are outside the visible range)
    parser.add_argument('-r', '--resample_file', type=argparse.FileType('r'), default=None, help='File to read resample points from')
    parser.add_argument('-e', '--resample_index', type=int, default=0)
    parser.add_argument('-x', '--xdata', type=int, default=0)
    parser.add_argument('-y', '--ydata', type=int, default=1)
    parser.add_argument('-g', '--group', nargs='+', type=int, default=[])
    parser.add_argument('-d', '--delimiter', default=None)
    parser.add_argument('-o', '--ordered', action='store_true', default=False, help='input is sorted by group')
    args = parser.parse_args()
    if args.begin and args.resample_file:
        # The two ways of specifying resample points are mutually exclusive.
        raise Exception('Cannot specify both file and begin parameters')
    elif args.resample_file:
        # Read explicit resample x-positions from the side file.
        args.resample_values = [ Decimal(line.rstrip().split()[args.resample_index]) for line in args.resample_file ]
        args.resample_file.close()
    # Resolve 'linear'/'step' to the matching interp_* function in this module.
    args.interpolatef = getattr(sys.modules[__name__], 'interp_' + args.interpolate)
    run_grouping(args.infile, ResampleGroup, args.group, args.delimiter, args.ordered)
if __name__ == "__main__":
    pp = ParameterParser('Plot maps of input files', infiles = '*', append = False, columns = '*', labels = [None], group = False, ordered = False)
    pp.parser.add_argument('-m', '--map', default='world', help='map to plot upon')
    pp.parser.add_argument('--size', default=[5, 5], nargs=2, type=int, help='size range of the markers')
    pp.parser.add_argument('--mode', default='auto', choices=['auto', 'markers', 'regions', 'text'])
    pp.parser.add_argument('--trigger', default='focus', choices=['none', 'focus', 'selection'], help='trigger for displaying tooltips')
    pp.parser.add_argument('--color-codes', default=[0, 1], nargs=2, type=int, help='range of values in color input')
    pp.parser.add_argument('--color-range', default=['#FF0000', '#00FF00'], nargs=2, help='range of colors to display')
    pp.parser.add_argument('--canvas', nargs=2, type=int, default=[500,300], help='canvas width and height in pixels')
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = args.columns_names
    args = pp.getArgs(args)
    # Python 2 print statements; `header` and `footer` are page templates
    # defined outside this chunk, and MapGroup emits the data rows between them.
    print header
    print "['{0}'],".format("', '".join(args.labels))
    for i,infile in enumerate(args.infiles):
        run_grouping(infile, MapGroup, [], False)
        infile.close()
    print footer.format(minSize=min(args.size), maxSize=max(args.size), map=args.map, mode=args.mode, trigger=args.trigger, minValue=min(args.color_codes), maxValue=max(args.color_codes), minColor=min(args.color_range), maxColor=max(args.color_range), width=args.canvas[0], height=args.canvas[1])
    # (chunk from the __main__ section of a distribution-fitting tool; the
    #  parser object and the DIST list are created outside this view)
    parser.add_argument('infile', nargs='?', default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('-c', '--column', default=0)
    parser.add_argument('-g', '--group', nargs='+', default=[])
    parser.add_argument('-i', '--dist', nargs='+', default=['norm'], choices=DIST)
    parser.add_argument('-d', '--delimiter', default=None)
    parser.add_argument('-o', '--ordered', action='store_true', default=False, help='input is sorted by group')
    args = parser.parse_args()
    args.distf = []
    if 'all' in args.dist:
        # 'all' expands to every known distribution name.
        args.dist.remove('all')
        args.dist.extend(DIST)
    for i in args.dist:
        # Resolve each distribution name to its scipy.stats object.
        args.distf.append(getattr(scipy.stats, i))
    args.infile = FileReader(args.infile)
    # Get the header from the input file if there is one
    args.inheader = args.infile.Header()
    # Setup output header
    args.outheader = Header()
    args.outheader.addCols(args.inheader.names(args.group))
    args.outheader.addCol('_'.join(args.inheader.names(args.group)) + '_count')
    # Write output header
    args.outfile.write(args.outheader.value())
    # Get columns for use in computation
    args.group = args.inheader.indexes(args.group)
    args.jdelim = args.delimiter if args.delimiter != None else ' '
    run_grouping(args.infile, FitGroup, args.group, args.delimiter, args.ordered)
    # (methods of the fraction group class; the class statement and __init__
    #  that create rows/fullrows/total are outside this chunk)
    def _add(self, chunks):
        # Tally the value's occurrence count and the group total.
        num = findNumber(chunks[args.column])
        self.rows[num] += 1
        self.total += num

    def addrow(self, chunks):
        # Variant that also retains the full rows for append-mode output.
        num = findNumber(chunks[args.column])
        self.rows[num] += 1
        self.total += num
        self.fullrows[num].append(chunks)

    def donerow(self):
        # Emit each buffered row with its value's share of the group total
        # (Python 2 iterkeys).
        for r in self.rows.iterkeys():
            for row in self.fullrows[r]:
                args.outfile.write(row + [r / self.total])

    def _done(self):
        # Emit one line per original occurrence, keyed by group tuple.
        for r,c in self.rows.iteritems():
            for i in range(c):
                args.outfile.write(self.tup + [r / self.total])

if __name__ == "__main__":
    pp = ParameterParser('Compute fraction of column sum', columns = 1, labels = [None])
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_fraction']
    args = pp.getArgs(args)
    # FractionGroup's class statement is outside this chunk.
    run_grouping(args.infile, FractionGroup, args.group, args.ordered)
def __init__(self, tup): super(ShareGroup, self).__init__(tup) def add(self, chunks): first,second = [list(reversed(chunks[col].strip(args.separator).split(args.separator))) for col in args.columns] share = 0 for f,s in zip(first,second): if f == s: share += 1 else: break if args.append: args.outfile.write(chunks + [share]) else: args.outfile.write(self.tup + [share]) def done(self): pass if __name__ == "__main__": pp = ParameterParser('Compute postfix share of column', columns = '*', labels = [None]) pp.parser.add_argument('-s', '--separator', default='.') args = pp.parseArgs() if not any(args.labels): args.labels = ['_'.join(args.columns_names) + '_postfix_share'] args = pp.getArgs(args) if len(args.columns) != 2: raise Exception('Must specify exactly 2 columns!') run_grouping(args.infile, ShareGroup, args.group, args.ordered)
def noop(val):
    """Identity transform, used when the corresponding option is unset."""
    return val


def quantize(val):
    """Round to the fixed exponent supplied via --quantize."""
    return val.quantize(args.quantize)


def binify(val):
    """Map val into its bin: floor(val / bin) * bin."""
    return (val / args.bin).to_integral_exact(rounding=ROUND_FLOOR) * args.bin


if __name__ == "__main__":
    pp = ParameterParser('Compute pdf', columns=1, labels=[None], group=False, ordered=False)
    pp.parser.add_argument('-q', '--quantize', type=Decimal, default=None, help='fixed exponent (e.g., 10, 1, 0.1)')
    pp.parser.add_argument('-s', '--significantDigits', type=int, default=None, help='number of significant digits')
    pp.parser.add_argument('-b', '--bin', type=Decimal, default=None, help='fit into bins, applies the formula: f(x) = floor(x / b) * b')
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = [args.column_name + '_round']
    args = pp.getArgs(args)
    if args.significantDigits is not None:
        # Applies globally to all Decimal arithmetic in this process.
        getcontext().prec = args.significantDigits
    args.binF = binify if args.bin is not None else noop
    args.quantF = quantize if args.quantize is not None else noop
    # RoundGroup is defined outside this chunk.
    run_grouping(args.infile, RoundGroup, [], False)
# (fragment: trailing `pass` from a definition that starts before this chunk)
pass


def readMapping(mapfile):
    """Builds {column: {anonymized: original_value}} from a mapping file whose
    rows are (column, value, anonymized)."""
    mapping = defaultdict(dict)
    for row in mapfile:
        mapping[row[0]][row[2]] = row[1]
    return mapping


if __name__ == "__main__":
    pp = ParameterParser('Replace column(s) with hashes for anonymization', columns='*', append=False, ordered=False, group=False)
    pp.parser.add_argument('-m', '--mapping', default=None)
    pp.parser.add_argument('-r', '--reverse', action='store_true', default=False)
    args = pp.parseArgs()
    args.append = True
    args = pp.getArgs(args)
    if args.mapping and not args.reverse:
        # Forward pass: record value<->hash pairs as they are generated.
        args.mapping = FileWriter(args.mapping, None, args)
        if args.infile.hasHeader:
            args.mapping.header.addCols(['column', 'value', 'anonymized'])
    elif args.mapping:
        # Reverse pass: load the previously written mapping.
        with FileReader(args.mapping, args) as mapfile:
            args.map = readMapping(mapfile)
    group = DeanonGroup if args.reverse else AnonGroup
    run_grouping(args.infile, group, [], False)
            # (chunk starts inside a percentile generator: irs/frs hold the
            #  integer and fractional rank parts per requested percentile,
            #  `pts` is the percentile list, `vals` maps value -> count,
            #  and the enclosing loop over sorted keys is outside this view)
            while count >= irs[ind]:
                if prev is None:
                    yield key
                elif frs[ind] == 0 or count != irs[ind]: # Whole value
                    yield prev
                else: # Falls on the border between keys
                    yield prev * frs[ind] + key * (1 - frs[ind])
                ind += 1
                if ind >= len(pts):
                    return
            count += vals[key]
            prev = key
        # Report remaining percentiles
        while ind < len(pts):
            yield key
            ind += 1

if __name__ == "__main__":
    pp = ParameterParser('Compute percentiles from a column', columns = '*', append = False, labels = [None])
    pp.parser.add_argument('-b', '--bin', default=None)
    pp.parser.add_argument('-p', '--percentiles', nargs='+', type=Decimal, default=DEFAULT_PCT)
    args = pp.parseArgs()
    # Percentiles must be ascending so the single pass above can emit them in order.
    args.percentiles = sorted(args.percentiles)
    if not any(args.labels):
        args.labels = ['{0}_ptile{1}'.format(cn, p) for cn in args.columns_names for p in args.percentiles]
    args = pp.getArgs(args)
    args.bin = args.infile.header.index(args.bin)
    # PercentileGroup is defined outside this chunk.
    run_grouping(args.infile, PercentileGroup, args.group, args.ordered)
#!/usr/bin/env python
import os
import sys
import argparse
from toollib.files import ParameterParser
from toollib.group import Group,run_grouping


class UniqueGroup(Group):
    """Counts distinct value-tuples of the selected columns within each group."""

    def __init__(self, tup):
        super(UniqueGroup, self).__init__(tup)
        self.seen = set()

    def add(self, chunks):
        self.seen.add(tuple(chunks[c] for c in args.columns))

    def done(self):
        args.outfile.write(self.tup + [len(self.seen)])


if __name__ == "__main__":
    pp = ParameterParser('Compute uniques counts of column(s)',
                         columns='*', append=False, labels=[None])
    args = pp.parseArgs()
    if not any(args.labels):
        args.labels = ['_'.join(args.columns_names) + '_uniques']
    args = pp.getArgs(args)
    run_grouping(args.infile, UniqueGroup, args.group, args.ordered)
        # (chunk starts inside SplitGroup.__init__; self.filename is derived
        #  earlier, outside this view)
        self.delimiter = args.infile.delimiter if args.infile.delimiter else ' '
        if not args.append and self.filename not in args.files:
            # First time this output file is seen: truncate it and copy the header.
            args.file_dict[self.filename] = openFile(self.filename, 'w')
            if args.infile.hasHeader:
                args.file_dict[self.filename].write(self.delimiter.join(map(str, args.infile.header.columns))+'\n')
            args.files.add(self.filename)

    def add(self, chunks):
        # Re-open lazily in append mode if no handle is currently cached.
        if self.filename not in args.file_dict:
            args.file_dict[self.filename] = openFile(self.filename, 'a')
        args.file_dict[self.filename].write(self.delimiter.join(chunks) + '\n')

    def done(self):
        pass

if __name__ == "__main__":
    pp = ParameterParser('Split a file on column(s)', columns = 0)
    pp.parser.add_argument('-p', '--prefix', default='split-')
    pp.parser.add_argument('-f', '--fuzz', default=None, help='lambda specifying fuzz for group assignments')
    args = pp.parseArgs()
    args = pp.getArgs(args)
    args.file_dict = FileHandleDict()
    if args.fuzz:
        # NOTE(review): eval of a user-supplied lambda. Acceptable for a local
        # CLI tool; never expose this to untrusted input.
        args.fuzz = eval(args.fuzz)
    args.files = set()
    run_grouping(args.infile, SplitGroup, args.group, args.ordered)
    # Close every output handle opened during the run.
    args.file_dict.close_all()
        # (fragment: tail of an interpolation helper defined before this chunk)
        x += f

if __name__ == "__main__":
    # set up command line args
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,\
        description='Resample the data points with a different frequency')
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    parser.add_argument('-f', '--frequency', type=Decimal, default=Decimal('1'))
    parser.add_argument('-i', '--interpolate', choices=['linear', 'step'], default='linear')
    parser.add_argument('-b', '--begin', type=Decimal, default=None, help='value to begin resampling at')
    parser.add_argument('-t', '--terminate', type=Decimal, default=None, help='value to terminate resampling at')
    parser.add_argument('-r', '--resample_file', type=argparse.FileType('r'), default=None, help='File to read resample points from')
    parser.add_argument('-e', '--resample_index', type=int, default=0)
    parser.add_argument('-x', '--xdata', type=int, default=0)
    parser.add_argument('-y', '--ydata', type=int, default=1)
    parser.add_argument('-g', '--group', nargs='+', type=int, default=[])
    parser.add_argument('-d', '--delimiter', default=None)
    parser.add_argument('-o', '--ordered', action='store_true', default=False, help='input is sorted by group')
    args = parser.parse_args()
    if args.begin and args.resample_file:
        # The two ways of specifying resample points are mutually exclusive.
        raise Exception('Cannot specify both file and begin parameters')
    elif args.resample_file:
        # Read explicit resample x-positions from the side file.
        args.resample_values = [Decimal(line.rstrip().split()[args.resample_index]) for line in args.resample_file]
        args.resample_file.close()
    # Resolve 'linear'/'step' to the matching interp_* function in this module.
    args.interpolatef = getattr(sys.modules[__name__], 'interp_'+args.interpolate)
    # ResampleGroup is defined outside this chunk.
    run_grouping(args.infile, ResampleGroup, args.group, args.delimiter, args.ordered)
import os
import sys
import argparse
from toollib.files import findNumber,ParameterParser
from toollib.group import Group,run_grouping
from numpy import corrcoef


class CorrelationGroup(Group):
    """Collects the selected columns per group and emits pairwise correlations."""

    def __init__(self, tup):
        super(CorrelationGroup, self).__init__(tup)
        self.vals = []

    def add(self, chunks):
        self.vals.append([float(findNumber(chunks[i])) for i in args.columns])

    def done(self):
        # A correlation matrix needs at least two rows and two columns.
        if len(self.vals) < 2 or len(self.vals[0]) < 2:
            return
        matrix = corrcoef(self.vals, rowvar=0)
        # Walk only the strictly-lower triangle: each pair once, no self-correlation.
        for i, row in enumerate(matrix):
            for j in range(i):
                args.outfile.write(self.tup + [args.columns_names[i], args.columns_names[j], row[j]])


if __name__ == "__main__":
    pp = ParameterParser('Compute correlation of 2 or more columns', columns='*', append=False)
    args = pp.parseArgs()
    args.labels = ['col1', 'col2', 'correlation']
    args = pp.getArgs(args)
    run_grouping(args.infile, CorrelationGroup, args.group, args.ordered)
            # (chunk starts inside the fit group's done(): `popt` holds fitted
            #  parameters computed earlier, outside this view)
            x = np.linspace(x[0], x[-1], args.granularity)
        else:
            # Explicit --range overrides the data-derived x extent.
            x = np.linspace(args.range[0], args.range[-1], args.granularity)
        y = args.function(x, *popt)
        # Emit the fitted curve as (x, y) sample points.
        for xi, yi in zip(x, y):
            args.outfile.write(self.tup + [xi, yi])

if __name__ == "__main__":
    # set up command line args
    pp = ParameterParser('Compute polynomial to fit data', columns=0, labels=[None], append=False)
    pp.parser.add_argument('-x', default=0)
    pp.parser.add_argument('-y', default=1)
    pp.parser.add_argument('-f', '--function', required=True, help='lambda expression of function to fit')
    pp.parser.add_argument('-r', '--range', nargs=2, default=None, type=int)
    pp.parser.add_argument('-a', '--granularity', default=1000, type=int)
    args = pp.parseArgs()
    args = pp.getArgs(args)
    # Resolve x/y column names or indices to numeric indices.
    args.x = args.infile.header.index(args.x)
    args.y = args.infile.header.index(args.y)
    import numpy as np
    # NOTE(review): eval of a user-supplied lambda. Acceptable for a local
    # CLI tool; never expose this to untrusted input.
    args.function = eval(args.function)
    # FitGroup is defined outside this chunk.
    run_grouping(args.infile, FitGroup, args.group, args.ordered)
        # (chunk starts inside KS_test: `u`/`v` are group objects being
        #  compared and `res` the test result tuple -- defined outside this view)
        outfile.write([ jdelim.join(u.tup + v.tup + map(str, res)) + '\n' ])
    # Summary line after all pairwise comparisons.
    outfile.write(['Verdict:' + str(verdict) + '\n'])

if __name__ == "__main__":
    # set up command line args
    pp = ParameterParser('Compute KS 2-sample', infiles='*', columns='*', append=False, labels=[None])
    pp.parser.add_argument('-r', '--random', default=None, type=int, help='perform on r random subsamples')
    pp.parser.add_argument('-s', '--subsample', default=100, type=int, help='subsample size')
    args = pp.parseArgs()
    args = pp.getArgs(args)
    args.groups = []
    for infile in args.infiles:
        # Each input file's groups are accumulated into args.groups by KSGroup.
        run_grouping(infile, KSGroup, args.group, args.delimiter)
    KS_test(args.groups, args.outfile)