def renderout(tmplfname, data, groupby, nperhit):
    """Renders mturk output.

    Every row of `data` is passed through `demultiplex(row, nperhit)` and the
    returned sequences are flattened into one list. The flattened rows are
    grouped by their `groupby` entry, and one result record is built per group,
    in sorted key order. A record holds the key/value pairs that are identical
    across all rows of the group, plus an 'outputs' entry with one NKStor per
    row. The list of records is rendered with the web.py template file
    `tmplfname`.

    Args:
        tmplfname: template file handed to web.template.frender().
        data: iterable of rows accepted by demultiplex().
        groupby: key used to group the de-multiplexed rows (row[groupby]).
        nperhit: forwarded unchanged to demultiplex().

    Returns:
        Whatever the compiled template returns for the list of records.
    """
    import itertools
    import web
    import web.template
    from nkutils import partitionByFunc
    from nkwebutils import NKStor

    # de-multiplex and flatten in one linear pass (sum(lists, []) is quadratic)
    data = list(itertools.chain.from_iterable(demultiplex(row, nperhit) for row in data))

    # group by common key
    grouped, _ = partitionByFunc(data, lambda d: d[groupby])

    results = []
    for _gkey, group in sorted(grouped.items()):
        # start from the first row and strip every entry some row disagrees on
        common = NKStor(group[0])
        for el in group:
            # iterate over a snapshot: entries are deleted from `common` below,
            # which must not happen while iterating a live view of it
            for k, v in list(common.items()):
                if el[k] != v:
                    del common[k]
        # now create each individual sub-output
        common['outputs'] = [NKStor(el) for el in group]
        results.append(common)

    # render results
    renfunc = web.template.frender(tmplfname)
    return renfunc(results)
def textaudiomainauto(txtfname, labelsfname, subfname):
    """A driver that takes a text and label file and creates subtitles.

    This tries to do it automatically, but doesn't work too well.

    The txt file should contain blank lines for major parts with no dialog.
    Lines starting with '(' are for signs in the video (no speech).
    The labels are as output from audacity's labeling feature, one per line:
        start time in seconds <TAB> end time in seconds <TAB> optional label
    (The label text is ignored. Blank lines in the label file are skipped.)

    Outline: the non-'(' lines are joined into one script string, which is cut
    at every run of punctuation/newline characters. Each label initially gets
    an equal share of the resulting pieces. Ten randomized shift steps then try
    to reduce the mismatch between each label's duration and the estimated
    speaking time of its text (characters * average seconds per character).
    Finally one subtitle per label is written through srtfile(subfname).
    Progress is printed to stdout.

    Args:
        txtfname: path of the script text file.
        labelsfname: path of the tab-separated label file.
        subfname: passed to srtfile() to create the subtitle writer.
    """
    # NOTE(review): a second definition of textaudiomainauto appears later in
    # this file and, being later, is the one that is bound at import time.
    import re
    from nkutils import memoize, partitionByFunc
    import numpy as np
    from random import randint

    # Runs of punctuation/newlines end a chunk; the SPACE variant also counts
    # blanks and is only used to estimate the number of words.
    DLM = '([".,;:?!\n]+)'
    DLMSPACE = '([ ".,;:?!\n]+)'

    # Read script and tokenize into chunks (single read, file closed promptly)
    with open(txtfname) as f:
        lines = [l.strip() for l in f]
    full = " ".join([l for l in lines if l])
    allseqs, _ = partitionByFunc(lines, lambda s: "comment" if s.startswith("(") else "script")
    script = "\n".join(allseqs["script"])
    while "\n\n" in script:
        script = script.replace("\n\n", "\n")
    nlines = len(script.split("\n"))
    nchars = len(script)
    nwords = len(re.findall(DLMSPACE, script))
    tokens = list(re.finditer(DLM, script))
    # locs are slice boundaries into script. Match.end() is an exclusive end
    # offset, so the closing boundary has to be len(script); len(script) - 1
    # would cut off the last character (or add a bogus boundary right before
    # trailing punctuation).
    locs = sorted(set([0, len(script)] + [t.end() for t in tokens]))
    toks = ["%s (%s)" % (t.group(), t.span()) for t in tokens]
    print("Read %d non-comment script lines (%d words, %d tokens, %d chars, %d locs): %s %s" % (
        nlines, nwords, len(tokens), nchars, len(locs), toks[:4], locs[:4]))

    # Read labels and compute speaking rates
    with open(labelsfname) as f:
        labels = [[float(x) for x in l.strip().split("\t")[:2]] for l in f if l.strip()]
    llens = [b - a for a, b in labels]
    totalsecs = sum(llens)
    print("Read %d labels, %0.2f secs: %s" % (len(labels), totalsecs, list(zip(labels, llens))[:2]))
    wpm = nwords / (totalsecs / 60.0)
    spc = totalsecs / nchars
    print("Got %0.1f wpm, %0.4f secs per char" % (wpm, spc))

    # Define cost function and memoize it
    def costfunc(labelnum, start, end, zerocost=0.2, spc=spc):
        """Computes the cost (in secs) of assigning the given start and end locs to the label.

        The locs are specified w.r.t. the 'locs' array. They can be identical.
        If the length is 0 (or an index is past the end of 'locs'), the cost is 'zerocost'.
        Else, the cost is (length of label) - (length of chunk)*spc
        Notice that's signed: positive means label is longer than chunk, and vice versa.
        """
        if start == end:
            return zerocost
        t = llens[labelnum]
        try:
            i, j = locs[start], locs[end]
        except IndexError:
            return zerocost
        return t - spc * (j - i)

    C = memoize(costfunc)

    # Initialize chunks: spread the M pieces evenly over the labels
    M = len(locs) - 1
    fac = M / float(len(llens))
    chunks = [[min(int(i * fac), M), min(int((i + 1) * fac), M)] for i in range(len(llens))]
    print("%s %s %s %s" % (len(llens), len(chunks), llens[:5], chunks[:5] + chunks[-5:]))

    costs = [C(i, a, b) for i, (a, b) in enumerate(chunks)]
    acosts = np.abs(np.array(costs))
    initialcost = sum(acosts)
    # NOTE(review): a candidate is accepted whenever it beats the *initial*
    # total cost (the threshold is never lowered), and costs/acosts keep the
    # candidate's values even when it is rejected. Left unchanged because
    # shiftchunk() is not visible here -- confirm whether this is intended.
    for iteration in range(1, 11):
        n = np.argmax(acosts)
        mc = costs[n]
        which = randint(0, 1)
        print("On iter %d, total cost %0.3f, maxcost %0.3f at %d, shifting %d" % (
            iteration, sum(acosts), mc, n, which))
        print(" %s" % (chunks2str(chunks)))
        if mc < 0:  # label shorter than chunk
            incr = 1 if which == 0 else -1
        else:  # label longer than chunk
            incr = 1 if which == 1 else -1
        newchunks = shiftchunk(chunks, n, which, incr)
        costs = [C(i, a, b) for i, (a, b) in enumerate(newchunks)]
        acosts = np.abs(np.array(costs))
        if sum(acosts) < initialcost:
            chunks = newchunks
    print(chunks)

    # now write output
    sf = srtfile(subfname)
    last = 0
    for (i, j), (t0, t1) in zip(chunks, labels):
        if i == j or i < 0 or j >= len(locs):
            continue
        s = script[locs[i]:locs[j]].strip()
        try:
            n = full.index(s.replace("\n", " "))
        except ValueError:
            # TODO this is because of comments inside the s
            print(" ERROR: |%s|: %s" % (s, full[:200]))
            n = 1
        if n > 0:
            # we have some skipped stuff, so dump it all in a single line
            dif = 0.05 * (t0 - last)  # so we're not touching boundaries
            sf(full[:n].strip(), last + dif, t0 - dif)
        # drop everything up to and including this chunk from the remaining text
        full = full[n + len(s):].strip()
        sf(s, t0, t1)
        last = t1
def textaudiomainauto(txtfname, labelsfname, subfname):
    """A driver that takes a text and label file and creates subtitles.

    This tries to do it automatically, but doesn't work too well.

    The txt file should contain blank lines for major parts with no dialog.
    Lines starting with '(' are for signs in the video (no speech).
    The labels are as output from audacity's labeling feature, one per line:
        start time in seconds <TAB> end time in seconds <TAB> optional label
    (The label text is ignored. Blank lines in the label file are skipped.)

    Outline: the non-'(' lines are joined into one script string, which is cut
    at every run of punctuation/newline characters. Each label initially gets
    an equal share of the resulting pieces. Ten randomized shift steps then try
    to reduce the mismatch between each label's duration and the estimated
    speaking time of its text (characters * average seconds per character).
    Finally one subtitle per label is written through srtfile(subfname).
    Progress is printed to stdout.

    Args:
        txtfname: path of the script text file.
        labelsfname: path of the tab-separated label file.
        subfname: passed to srtfile() to create the subtitle writer.
    """
    # NOTE(review): an earlier definition of textaudiomainauto exists in this
    # file; this later one replaces it when the module is loaded.
    import re
    from nkutils import memoize, partitionByFunc
    import numpy as np
    from random import randint

    # Runs of punctuation/newlines end a chunk; the SPACE variant also counts
    # blanks and is only used to estimate the number of words.
    DLM = '([".,;:?!\n]+)'
    DLMSPACE = '([ ".,;:?!\n]+)'

    # Read script and tokenize into chunks (single read, file closed promptly)
    with open(txtfname) as f:
        lines = [l.strip() for l in f]
    full = ' '.join([l for l in lines if l])
    allseqs, _ = partitionByFunc(
        lines, lambda s: 'comment' if s.startswith('(') else 'script')
    script = '\n'.join(allseqs['script'])
    while '\n\n' in script:
        script = script.replace('\n\n', '\n')
    nlines = len(script.split('\n'))
    nchars = len(script)
    nwords = len(re.findall(DLMSPACE, script))
    tokens = list(re.finditer(DLM, script))
    # locs are slice boundaries into script. Match.end() is an exclusive end
    # offset, so the closing boundary has to be len(script); len(script) - 1
    # would cut off the last character (or add a bogus boundary right before
    # trailing punctuation).
    locs = sorted(set([0, len(script)] + [t.end() for t in tokens]))
    toks = ['%s (%s)' % (t.group(), t.span()) for t in tokens]
    print('Read %d non-comment script lines (%d words, %d tokens, %d chars, %d locs): %s %s' % (
        nlines, nwords, len(tokens), nchars, len(locs), toks[:4], locs[:4]))

    # Read labels and compute speaking rates
    with open(labelsfname) as f:
        labels = [[float(x) for x in l.strip().split('\t')[:2]] for l in f if l.strip()]
    llens = [b - a for a, b in labels]
    totalsecs = sum(llens)
    print('Read %d labels, %0.2f secs: %s' % (len(labels), totalsecs, list(zip(labels, llens))[:2]))
    wpm = nwords / (totalsecs / 60.0)
    spc = totalsecs / nchars
    print('Got %0.1f wpm, %0.4f secs per char' % (wpm, spc))

    # Define cost function and memoize it
    def costfunc(labelnum, start, end, zerocost=0.2, spc=spc):
        """Computes the cost (in secs) of assigning the given start and end locs to the label.

        The locs are specified w.r.t. the 'locs' array. They can be identical.
        If the length is 0 (or an index is past the end of 'locs'), the cost is 'zerocost'.
        Else, the cost is (length of label) - (length of chunk)*spc
        Notice that's signed: positive means label is longer than chunk, and vice versa.
        """
        if start == end:
            return zerocost
        t = llens[labelnum]
        try:
            i, j = locs[start], locs[end]
        except IndexError:
            return zerocost
        return t - spc * (j - i)

    C = memoize(costfunc)

    # Initialize chunks: spread the M pieces evenly over the labels
    M = len(locs) - 1
    fac = M / float(len(llens))
    chunks = [[min(int(i * fac), M), min(int((i + 1) * fac), M)] for i in range(len(llens))]
    print('%s %s %s %s' % (len(llens), len(chunks), llens[:5], chunks[:5] + chunks[-5:]))

    costs = [C(i, a, b) for i, (a, b) in enumerate(chunks)]
    acosts = np.abs(np.array(costs))
    initialcost = sum(acosts)
    # NOTE(review): a candidate is accepted whenever it beats the *initial*
    # total cost (the threshold is never lowered), and costs/acosts keep the
    # candidate's values even when it is rejected. Left unchanged because
    # shiftchunk() is not visible here -- confirm whether this is intended.
    for iteration in range(1, 11):
        n = np.argmax(acosts)
        mc = costs[n]
        which = randint(0, 1)
        print('On iter %d, total cost %0.3f, maxcost %0.3f at %d, shifting %d' % (
            iteration, sum(acosts), mc, n, which))
        print(' %s' % (chunks2str(chunks)))
        if mc < 0:  # label shorter than chunk
            incr = 1 if which == 0 else -1
        else:  # label longer than chunk
            incr = 1 if which == 1 else -1
        newchunks = shiftchunk(chunks, n, which, incr)
        costs = [C(i, a, b) for i, (a, b) in enumerate(newchunks)]
        acosts = np.abs(np.array(costs))
        if sum(acosts) < initialcost:
            chunks = newchunks
    print(chunks)

    # now write output
    sf = srtfile(subfname)
    last = 0
    for (i, j), (t0, t1) in zip(chunks, labels):
        if i == j or i < 0 or j >= len(locs):
            continue
        s = script[locs[i]:locs[j]].strip()
        try:
            n = full.index(s.replace('\n', ' '))
        except ValueError:
            # TODO this is because of comments inside the s
            print(' ERROR: |%s|: %s' % (s, full[:200]))
            n = 1
        if n > 0:
            # we have some skipped stuff, so dump it all in a single line
            dif = 0.05 * (t0 - last)  # so we're not touching boundaries
            sf(full[:n].strip(), last + dif, t0 - dif)
        # drop everything up to and including this chunk from the remaining text
        full = full[n + len(s):].strip()
        sf(s, t0, t1)
        last = t1