Пример #1
0
def renderout(tmplfname, data, groupby, nperhit):
    """Renders mturk output.

    Every row of `data` is expanded with `demultiplex(row, nperhit)`, the
    resulting items are grouped by their `groupby` key, and one result is
    built per group (groups are visited in sorted key order). A result holds
    the key/value pairs that are identical across every item of the group,
    plus an 'outputs' list with one NKStor per item.

    Args:
        tmplfname: template file name handed to `web.template.frender`.
        data: iterable of rows accepted by `demultiplex`.
        groupby: key looked up in each demultiplexed item to form the groups.
        nperhit: forwarded unchanged to `demultiplex`.

    Returns:
        Whatever the compiled template returns when called with the list of
        results.
    """
    from itertools import chain

    import web.template
    from nkutils import partitionByFunc
    from nkwebutils import NKStor

    # de-multiplex and flatten in one linear pass (sum(..., []) is quadratic)
    items = list(chain.from_iterable(demultiplex(row, nperhit) for row in data))
    # group by common key; the second return value is not needed here
    grouped, _ = partitionByFunc(items, lambda d: d[groupby])
    results = []
    for gkey in sorted(grouped):
        group = grouped[gkey]
        # start from the first item and drop every key that is not shared,
        # with an equal value, by all items in the group
        common = NKStor(group[0])
        for el in group:
            # iterate over a snapshot: deleting from a dict while iterating
            # its live items() view raises RuntimeError on Python 3
            for k, v in list(common.items()):
                # a key missing from 'el' is not common either
                if k not in el or el[k] != v:
                    del common[k]
        # now create each individual sub-output
        common['outputs'] = [NKStor(el) for el in group]
        results.append(common)
    # render results
    renfunc = web.template.frender(tmplfname)
    return renfunc(results)
Пример #2
0
def textaudiomainauto(txtfname, labelsfname, subfname):
    """A driver that takes a text and label file and creates subtitles.

    This tries to do it automatically, but doesn't work too well.
    The txt file should contain blank lines for major parts with no dialog.
    Lines starting with '(' are for signs in the video (no speech).
    The labels are as output from audacity's labeling feature:
        start time in seconds \t end time in seconds \t optional label
    (The labels are ignored.)

    Args:
        txtfname: path of the script text file.
        labelsfname: path of the tab-separated label file; blank lines are
            skipped.
        subfname: passed to srtfile() to create the subtitle writer.

    Progress and diagnostics are printed to stdout. Nothing is returned.
    An empty script or a label file without any duration leads to a
    ZeroDivisionError when the speaking rates are computed.
    """
    import re
    from random import randint

    import numpy as np
    from nkutils import memoize, partitionByFunc

    DLM = '([".,;:?!\n]+)'
    DLMSPACE = '([ ".,;:?!\n]+)'

    # Read the script once and make sure the file gets closed.
    with open(txtfname) as f:
        lines = [l.strip() for l in f]
    # 'full' still contains the '(' lines; it is consumed left to right
    # while the subtitles are written.
    full = " ".join([l for l in lines if l])
    # Only the "script" partition is used; the second return value (a mapping
    # back to the original line indices) is not needed.
    allseqs, _ = partitionByFunc(lines, lambda s: "comment" if s.startswith("(") else "script")
    script = "\n".join(allseqs["script"])
    while "\n\n" in script:
        script = script.replace("\n\n", "\n")
    nlines = len(script.split("\n"))
    nchars = len(script)
    nwords = len(re.findall(DLMSPACE, script))
    tokens = list(re.finditer(DLM, script))
    # Candidate cut points: the end of every delimiter run, plus offset 0 and
    # the offset of the last character.
    # NOTE(review): len(script) - 1 excludes the final character from the
    # last slice unless a delimiter run ends the script -- confirm intended.
    locs = sorted(set([0, len(script) - 1] + [t.end() for t in tokens]))
    toks = ["%s (%s)" % (t.group(), t.span()) for t in tokens]
    print("Read %d non-comment script lines (%d words, %d tokens, %d chars, %d locs): %s %s" % (
        nlines,
        nwords,
        len(tokens),
        nchars,
        len(locs),
        toks[:4],
        locs[:4],
    ))

    # Read labels and compute speaking rates
    with open(labelsfname) as f:
        labels = [[float(x) for x in l.strip().split("\t")[:2]] for l in f if l.strip()]
    llens = [b - a for a, b in labels]
    totalsecs = sum(llens)
    print("Read %d labels, %0.2f secs: %s" % (len(labels), totalsecs, list(zip(labels, llens))[:2]))
    wpm = nwords / (totalsecs / 60.0)
    spc = totalsecs / nchars
    print("Got %0.1f wpm, %0.4f secs per char" % (wpm, spc))

    # Define cost function and memoize it
    def costfunc(labelnum, start, end, zerocost=0.2, spc=spc):
        """Computes the cost (in secs) of assigning the given start and end locs to the label.
        The locs are specified w.r.t. to the 'locs' array. They can be identical.
        If the length is 0, the cost is 'zerocost'.
        Else, the cost is (length of label) - (length of chunk)*spc
        Notice that's signed: positive means label is longer than chunk, and vice versa.
        Indices that fall outside 'locs' also cost 'zerocost'.
        """
        if start == end:
            return zerocost
        t = llens[labelnum]
        try:
            i, j = locs[start], locs[end]
        except IndexError:
            return zerocost
        return t - spc * (j - i)

    C = memoize(costfunc)

    # Initialize chunks: spread the locs evenly over the labels
    M = len(locs) - 1
    fac = M / float(len(llens))
    chunks = [[min(int(i * fac), M), min(int((i + 1) * fac), M)] for i in range(len(llens))]
    print("%s %s %s %s" % (len(llens), len(chunks), llens[:5], chunks[:5] + chunks[-5:]))

    # Hill climbing: nudge one boundary of the worst chunk and keep the move
    # only when it lowers the total absolute cost. 'costs'/'acosts' always
    # describe the currently accepted 'chunks'.
    costs = [C(i, a, b) for i, (a, b) in enumerate(chunks)]
    acosts = np.abs(np.array(costs))
    best = sum(acosts)
    for iternum in range(1, 11):
        n = np.argmax(acosts)
        mc = costs[n]
        # randomly chosen 0/1, passed on to shiftchunk as the boundary selector
        which = randint(0, 1)
        print("On iter %d, total cost %0.3f, maxcost %0.3f at %d, shifting %d" % (iternum, sum(acosts), mc, n, which))
        print("  %s" % (chunks2str(chunks)))
        if mc < 0:  # label shorter than chunk
            incr = 1 if which == 0 else -1
        else:  # label longer than chunk
            incr = 1 if which == 1 else -1
        # Work on a copy so a rejected candidate can never alter 'chunks',
        # whether or not shiftchunk modifies its argument.
        newchunks = shiftchunk([list(c) for c in chunks], n, which, incr)
        newcosts = [C(i, a, b) for i, (a, b) in enumerate(newchunks)]
        newacosts = np.abs(np.array(newcosts))
        newtotal = sum(newacosts)
        if newtotal < best:
            best, chunks, costs, acosts = newtotal, newchunks, newcosts, newacosts
    print(chunks)

    # now write output
    sf = srtfile(subfname)
    last = 0
    for (i, j), (t0, t1) in zip(chunks, labels):
        if i == j:
            continue
        if i < 0 or j >= len(locs):
            continue
        s = script[locs[i] : locs[j]].strip()
        try:
            n = full.index(s.replace("\n", " "))
        except ValueError:
            print("  ERROR: |%s|: %s" % (s, full[:200]))
            # TODO this is because of comments inside the s
            n = 1

        if n > 0:
            # we have some skipped stuff, so dump it all in a single line
            dif = 0.05 * (t0 - last)  # so we're not touching boundaries
            sf(full[:n].strip(), last + dif, t0 - dif)
        full = full[n + len(s) :].strip()
        sf(s, t0, t1)
        last = t1
Пример #3
0
def textaudiomainauto(txtfname, labelsfname, subfname):
    """A driver that takes a text and label file and creates subtitles.

    This tries to do it automatically, but doesn't work too well.
    The txt file should contain blank lines for major parts with no dialog.
    Lines starting with '(' are for signs in the video (no speech).
    The labels are as output from audacity's labeling feature:
        start time in seconds \t end time in seconds \t optional label
    (The labels are ignored.)

    Args:
        txtfname: path of the script text file.
        labelsfname: path of the tab-separated label file; blank lines are
            skipped.
        subfname: passed to srtfile() to create the subtitle writer.

    Progress and diagnostics are printed to stdout. Nothing is returned.
    An empty script or a label file without any duration leads to a
    ZeroDivisionError when the speaking rates are computed.
    """
    import re
    from random import randint

    import numpy as np
    from nkutils import memoize, partitionByFunc

    DLM = '([".,;:?!\n]+)'
    DLMSPACE = '([ ".,;:?!\n]+)'

    # Read the script once and make sure the file gets closed.
    with open(txtfname) as f:
        lines = [l.strip() for l in f]
    # 'full' still contains the '(' lines; it is consumed left to right
    # while the subtitles are written.
    full = ' '.join([l for l in lines if l])
    # Only the 'script' partition is used; the second return value (a mapping
    # back to the original line indices) is not needed.
    allseqs, _ = partitionByFunc(
        lines, lambda s: 'comment' if s.startswith('(') else 'script')
    script = '\n'.join(allseqs['script'])
    while '\n\n' in script:
        script = script.replace('\n\n', '\n')
    nlines = len(script.split('\n'))
    nchars = len(script)
    nwords = len(re.findall(DLMSPACE, script))
    tokens = list(re.finditer(DLM, script))
    # Candidate cut points: the end of every delimiter run, plus offset 0 and
    # the offset of the last character.
    # NOTE(review): len(script) - 1 excludes the final character from the
    # last slice unless a delimiter run ends the script -- confirm intended.
    locs = sorted(set([0, len(script) - 1] + [t.end() for t in tokens]))
    toks = ['%s (%s)' % (t.group(), t.span()) for t in tokens]
    print('Read %d non-comment script lines (%d words, %d tokens, %d chars, %d locs): %s %s' % (
        nlines, nwords, len(tokens), nchars, len(locs), toks[:4], locs[:4]))

    # Read labels and compute speaking rates
    with open(labelsfname) as f:
        labels = [[float(x) for x in l.strip().split('\t')[:2]]
                  for l in f if l.strip()]
    llens = [b - a for a, b in labels]
    totalsecs = sum(llens)
    print('Read %d labels, %0.2f secs: %s' % (len(labels), totalsecs,
                                              list(zip(labels, llens))[:2]))
    wpm = nwords / (totalsecs / 60.0)
    spc = totalsecs / nchars
    print('Got %0.1f wpm, %0.4f secs per char' % (wpm, spc))

    # Define cost function and memoize it
    def costfunc(labelnum, start, end, zerocost=0.2, spc=spc):
        """Computes the cost (in secs) of assigning the given start and end locs to the label.
        The locs are specified w.r.t. to the 'locs' array. They can be identical.
        If the length is 0, the cost is 'zerocost'.
        Else, the cost is (length of label) - (length of chunk)*spc
        Notice that's signed: positive means label is longer than chunk, and vice versa.
        Indices that fall outside 'locs' also cost 'zerocost'.
        """
        if start == end:
            return zerocost
        t = llens[labelnum]
        try:
            i, j = locs[start], locs[end]
        except IndexError:
            return zerocost
        return t - spc * (j - i)

    C = memoize(costfunc)

    # Initialize chunks: spread the locs evenly over the labels
    M = len(locs) - 1
    fac = M / float(len(llens))
    chunks = [[min(int(i * fac), M),
               min(int((i + 1) * fac), M)] for i in range(len(llens))]
    print('%s %s %s %s' % (len(llens), len(chunks), llens[:5],
                           chunks[:5] + chunks[-5:]))

    # Hill climbing: nudge one boundary of the worst chunk and keep the move
    # only when it lowers the total absolute cost. 'costs'/'acosts' always
    # describe the currently accepted 'chunks'.
    costs = [C(i, a, b) for i, (a, b) in enumerate(chunks)]
    acosts = np.abs(np.array(costs))
    best = sum(acosts)
    for iternum in range(1, 11):
        n = np.argmax(acosts)
        mc = costs[n]
        # randomly chosen 0/1, passed on to shiftchunk as the boundary selector
        which = randint(0, 1)
        print('On iter %d, total cost %0.3f, maxcost %0.3f at %d, shifting %d' % (
            iternum, sum(acosts), mc, n, which))
        print('  %s' % (chunks2str(chunks)))
        if mc < 0:  # label shorter than chunk
            incr = 1 if which == 0 else -1
        else:  # label longer than chunk
            incr = 1 if which == 1 else -1
        # Work on a copy so a rejected candidate can never alter 'chunks',
        # whether or not shiftchunk modifies its argument.
        newchunks = shiftchunk([list(c) for c in chunks], n, which, incr)
        newcosts = [C(i, a, b) for i, (a, b) in enumerate(newchunks)]
        newacosts = np.abs(np.array(newcosts))
        newtotal = sum(newacosts)
        if newtotal < best:
            best, chunks, costs, acosts = newtotal, newchunks, newcosts, newacosts
    print(chunks)

    # now write output
    sf = srtfile(subfname)
    last = 0
    for (i, j), (t0, t1) in zip(chunks, labels):
        if i == j:
            continue
        if i < 0 or j >= len(locs):
            continue
        s = script[locs[i]:locs[j]].strip()
        try:
            n = full.index(s.replace('\n', ' '))
        except ValueError:
            print('  ERROR: |%s|: %s' % (s, full[:200]))
            #TODO this is because of comments inside the s
            n = 1

        if n > 0:
            # we have some skipped stuff, so dump it all in a single line
            dif = 0.05 * (t0 - last)  # so we're not touching boundaries
            sf(full[:n].strip(), last + dif, t0 - dif)
        full = full[n + len(s):].strip()
        sf(s, t0, t1)
        last = t1