Exemplo n.º 1
0
def CreateProtectedCounts(work):
    """Generate <work>/protected.all from <work>/float.all.

    Runs, inside bash, the pipeline
    float-counts-to-histories | sort | histories-to-null-counts, reading
    <work>/float.all and writing <work>/protected.all.  The sort receives
    the module-level `sort_mem_opt` option string.  The command is logged to
    <work>/log/create_protected_counts.log.
    """
    pipeline = ("float-counts-to-histories <{work}/float.all | "
                "LC_ALL=C sort {mem}| "
                "histories-to-null-counts >{work}/protected.all").format(
                    work=work, mem=sort_mem_opt)
    # wrap the whole pipeline in single quotes so it is run by bash.
    command = "bash -c '" + pipeline + "'"
    log_file = "{0}/log/create_protected_counts.log".format(work)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 2
0
def GetCountsSingleProcess(source_int_dir,
                           dest_count_dir,
                           ngram_order,
                           n,
                           max_mem,
                           num_splits=0):
    """Get int-counts for training set `n` using one sorting process.

    Reads {source_int_dir}/{n}.txt.gz and pipes it through get-text-counts,
    sort, uniq -c and get-int-counts.

    If num_splits is 0, get-int-counts writes /dev/null plus one file per
    order 2..ngram_order, named {dest_count_dir}/int.{n}.{order}.
    Otherwise its stdout is piped into split-int-counts, which is given the
    files {dest_count_dir}/int.{n}.split{j} for j in 1..num_splits.

    max_mem: if not the empty string, passed to sort as --buffer-size.
    """
    if num_splits != 0:
        assert num_splits >= 1
        split_files = [
            "{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
            for j in range(1, num_splits + 1)
        ]
        int_counts_output = ('/dev/stdout | split-int-counts ' +
                             ' '.join(split_files))
    else:
        order_files = [
            "{0}/int.{1}.{2}".format(dest_count_dir, n, o)
            for o in range(2, ngram_order + 1)
        ]
        int_counts_output = "/dev/null " + " ".join(order_files)

    if args.limit_unk_history == 'true':
        unk_opt = "--limit-unk-history"
    else:
        unk_opt = ""
    if max_mem != '':
        sort_opt = "--buffer-size={0}".format(max_mem)
    else:
        sort_opt = ''

    # Assemble the bash pipeline one stage at a time.
    command = ("bash -c 'set -o pipefail; export LC_ALL=C; " +
               "gunzip -c {0}/{1}.txt.gz | ".format(source_int_dir, n) +
               "get-text-counts {0} {1} | ".format(unk_opt, ngram_order) +
               "sort {0}| uniq -c | ".format(sort_opt) +
               "get-int-counts {0}'".format(int_counts_output))
    log_file = "{0}/log/get_counts.{1}.log".format(dest_count_dir, n)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 3
0
def EnforceMinCounts(dest_count_dir, formatted_min_counts, ngram_order,
                     num_train_sets, j):
    """Run int-counts-enforce-min-counts on split `j` of all training sets.

    Inputs are {dest_count_dir}/int.{n}.split{j} for n in 1..num_train_sets.
    Outputs are, for each such n, the files
    {dest_count_dir}/int.{n}.split{j}.{o} for o in 2..ngram_order.
    E.g. if j is 2 and ngram_order is 4 the outputs are [written here with
    brace expansion]:
       dir/int.1.split2.{2,3,4} dir/int.2.split2.{2,3,4} ...
    The command is logged to {dest_count_dir}/log/enforce_min_counts.{j}.log.
    """
    input_files = []
    output_groups = []
    for n in range(1, num_train_sets + 1):
        split_file = "{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
        input_files.append(split_file)
        # one space-separated group of per-order outputs per training set.
        output_groups.append(' '.join([
            "{0}.{1}".format(split_file, o)
            for o in range(2, ngram_order + 1)
        ]))
    inputs = ' '.join(input_files)
    outputs = ' '.join(output_groups)

    command = "int-counts-enforce-min-counts {0} {1} {2} {3}".format(
        ngram_order, formatted_min_counts, inputs, outputs)
    log_file = '{0}/log/enforce_min_counts.{1}.log'.format(dest_count_dir, j)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 4
0
def MergeCounts(order):
    """Merge the counts of the given order (> 1) into work_dir/merged.{order}.

    The merge-counts inputs are the int-counts of every training set, each
    given as 'file,scale' using the scale from train_set_scale; plus the dev
    int-counts if args.fold_dev_into_int is set; plus, for orders below
    ngram_order, work_dir/discount.{order}.
    """
    assert order > 1
    pieces = ["merge-counts"]
    for n in range(1, num_train_sets + 1):
        pieces.append("{0}/int.{1}.{2},{3}".format(
            args.count_dir, n, order, train_set_scale[n]))
    if args.fold_dev_into_int is not None:
        pieces.append("{0}/int.dev.{1},{2}".format(
            args.count_dir, order, train_set_scale[args.fold_dev_into_int]))

    # for orders less than the highest order, we also have to include the
    # discount counts from the one-higher order.  there is no scale here, so
    # the program will expect general-counts, not int-counts.
    if order < ngram_order:
        pieces.append("{0}/discount.{1}".format(args.work_dir, order))

    # the output gets redirected to the output file.
    pieces.append(">{0}/merged.{1}".format(args.work_dir, order))

    command = " ".join(pieces)
    log_file = "{0}/log/merge_counts.{1}.log".format(args.work_dir, order)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 5
0
def MergeDevData(dest_count_dir, ngram_order):
    """Merge the per-order dev int-counts into {dest_count_dir}/int.dev.

    The merge-int-counts inputs are {dest_count_dir}/int.dev.{o} for o in
    2..ngram_order.  The command is logged to
    {dest_count_dir}/log/merge_dev_counts.log.
    """
    dev_files = [
        "{0}/int.dev.{1}".format(dest_count_dir, o)
        for o in range(2, ngram_order + 1)
    ]
    # note: the redirection follows the last input with no space in between.
    command = "merge-int-counts {0}>{1}/int.dev".format(
        ' '.join(dev_files), dest_count_dir)
    log_file = "{0}/log/merge_dev_counts.log".format(dest_count_dir)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 6
0
def MergeCountsOrder1():
    """Merge the order-1 discount counts of all splits.

    Inputs are {split_work_dir}/{s}/discount.1 for s in 1..args.num_splits;
    the merged output is written to {args.work_dir}/discount.1.
    """
    split_discounts = []
    for s in range(1, args.num_splits + 1):
        split_discounts.append("{0}/{1}/discount.1".format(split_work_dir, s))
    command = "merge-counts {0} >{1}/discount.1".format(
        " ".join(split_discounts), args.work_dir)
    log_file = "{0}/log/merge_counts_order1.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 7
0
def MergeAllOrders():
    """Merge work_dir/float.{1..ngram_order} into work_dir/float.all.

    After running merge-float-counts, the log file is handed to
    ParseNumNgrams together with the work directory.
    """
    work = args.work_dir
    per_order = []
    for n in range(1, ngram_order + 1):
        per_order.append("{0}/float.{1}".format(work, n))
    # note: the redirection follows the last input with no space in between.
    command = "merge-float-counts {0}>{1}/float.all".format(
        " ".join(per_order), work)
    log_file = "{0}/log/merge_all_orders.log".format(work)
    RunCommand(command, log_file, args.verbose == 'true')
    ParseNumNgrams(work, log_file)
Exemplo n.º 8
0
def DiscountCountsOrder1Backward():
    """Run discount-counts-1gram-backward in args.work_dir.

    The program is given, in order, the work-dir files discount.1, float.1,
    float_derivs.1 and discount_derivs.1.
    """
    work = args.work_dir
    operands = ("discount.1", "float.1", "float_derivs.1",
                "discount_derivs.1")
    command = "discount-counts-1gram-backward " + " ".join(
        ["{0}/{1}".format(work, name) for name in operands])
    log_file = "{0}/log/discount_counts_order1_backward.log".format(work)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 9
0
def MergeAllSplits():
    """Merge the float.all files across all splits.

    Inputs are {split_work_dir}/{i}/float.all for i in 1..args.num_splits;
    the merged output is written to {args.work_dir}/float.all.
    """
    split_files = []
    for split_index in range(1, args.num_splits + 1):
        split_files.append(
            "{0}/{1}/float.all".format(split_work_dir, split_index))
    # note: the redirection follows the last input with no space in between.
    command = "merge-float-counts {0}>{1}/float.all".format(
        " ".join(split_files), args.work_dir)
    log_file = "{0}/log/merge_all_splits.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 10
0
def DiscountCounts(split_index, order):
    """Discount the counts of the given order (> 1) within one split.

    Runs discount-counts with the four discounting constants for this order
    (d1..d4), on merged.{order} in the split's work directory; the remaining
    arguments are float.{order} and discount.{order-1} in that directory.
    """
    assert order > 1
    this_split_work = "{0}/{1}".format(split_work_dir, split_index)
    discounts = "{0} {1} {2} {3}".format(d1[order], d2[order], d3[order],
                                         d4[order])
    # the command string deliberately keeps its trailing space.
    command = ("discount-counts {0} {1}/merged.{2} {1}/float.{2} "
               "{1}/discount.{3} ".format(discounts, this_split_work, order,
                                          order - 1))
    log_file = "{0}/log/discount_counts.{1}.{2}.log".format(
        args.work_dir, split_index, order)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 11
0
def SumFloatDerivsOrder1():
    """Sum the unigram float-count derivatives from all split directories.

    This has to be called before DiscountCountsOrder1Backward: it sums up
    the different parts of the final float-count derivatives w.r.t. the
    unigram counts, one part per split directory, and writes the result to
    {args.work_dir}/float_derivs.1.
    """
    split_derivs = [
        "{0}/{1}/float_derivs.1".format(split_work_dir, s)
        for s in range(1, args.num_splits + 1)
    ]
    command = "sum-float-derivs {0}/float.1 {1} >{0}/float_derivs.1".format(
        args.work_dir, " ".join(split_derivs))
    log_file = "{0}/log/sum_float_counts_order1.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 12
0
def MergeCountsOrder1Backward():
    """Backward pass of the merge of order-1 discount counts across splits.

    merge-counts-backward is given the work-dir discount.1 and
    discount_derivs.1, followed by the discount.1 / discount_derivs.1 pair
    of each split.  Its stdout is sent to /dev/null because it writes a
    newline there (this terminates the derivs w.r.t. the scaling factors,
    which are written to stdout but in this case are empty).
    """
    split_pairs = []
    for s in range(1, args.num_splits + 1):
        split_dir = "{0}/{1}".format(split_work_dir, s)
        split_pairs.append(
            "{0}/discount.1 {0}/discount_derivs.1".format(split_dir))
    # note: the redirection follows the last operand with no space before it.
    command = ("merge-counts-backward {0}/discount.1 {0}/discount_derivs.1 "
               "{1}>/dev/null".format(args.work_dir, " ".join(split_pairs)))
    log_file = "{0}/log/merge_counts_order1_backward.log".format(args.work_dir)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 13
0
def MergeAllOrders(split_index):
    """Merge all orders of float-counts for one split into its float.all.

    Note that for unigram, the input is the merged-across-all-splits
    float.1 from the top-level work-dir, not from the split work-dir.
    Afterwards the log file is handed to ParseNumNgrams together with the
    split's work directory.
    """
    this_split_work = "{0}/{1}".format(split_work_dir, split_index)
    per_order = []
    for n in range(1, ngram_order + 1):
        if n > 1:
            source_dir = this_split_work
        else:
            source_dir = args.work_dir
        per_order.append("{0}/float.{1}".format(source_dir, n))
    # note: the redirection follows the last input with no space in between.
    command = "merge-float-counts {0}>{1}/float.all".format(
        " ".join(per_order), this_split_work)
    log_file = "{0}/log/merge_all_orders.{1}.log".format(args.work_dir,
                                                         split_index)
    RunCommand(command, log_file, args.verbose == 'true')
    ParseNumNgrams(this_split_work, log_file)
Exemplo n.º 14
0
def DiscountCounts(order):
    """Discount the counts of the given order (> 1) in args.work_dir.

    Runs discount-counts with the four discounting constants for this order
    (d1..d4), on merged.{order}; the remaining arguments are float.{order}
    and discount.{order-1}, all in the work directory.
    """
    assert order > 1
    work = args.work_dir
    discounts = "{0} {1} {2} {3}".format(d1[order], d2[order], d3[order],
                                         d4[order])
    # the command string deliberately keeps its trailing space.
    command = ("discount-counts {0} {1}/merged.{2} {1}/float.{2} "
               "{1}/discount.{3} ".format(discounts, work, order, order - 1))
    log_file = "{0}/log/discount_counts.{1}.log".format(work, order)
    RunCommand(command, log_file, args.verbose == 'true')
Exemplo n.º 15
0
def CreateInitialWorkDir():
    """Create float.all, stats.all and protected.all in work_dir/step0.

    num_ngrams is soft-linked from args.lm_dir_in.  float.all is either
    soft-linked from args.lm_dir_in (when num_splits is None) or produced by
    merging float.all.1 .. float.all.{num_splits} from there.  The per-order
    stats files are temporary: they are merged into stats.all and removed.
    """
    step0 = "{0}/step0".format(work_dir)
    step0_log = step0 + "/log"
    if not os.path.isdir(step0_log):
        os.makedirs(step0_log)

    SoftLink(args.lm_dir_in + "/num_ngrams", step0 + "/num_ngrams")

    # create float.all
    if num_splits is not None:
        split_inputs = []
        for n in range(1, num_splits + 1):
            split_inputs.append(args.lm_dir_in + "/float.all." + str(n))
        command = "merge-float-counts {0} >{1}/float.all".format(
            ' '.join(split_inputs), step0)
        log_file = step0_log + "/merge_initial_float_counts.log"
        RunCommand(command, log_file, args.verbose == 'true')
    else:
        SoftLink(args.lm_dir_in + "/float.all", step0 + "/float.all")

    # create protected.all
    CreateProtectedCounts(step0)

    stats_star = ' '.join(
        ["{0}/stats.{1}".format(step0, n) for n in range(1, ngram_order + 1)])

    # create stats.{1,2,3..}
    # e.g. command = 'float-counts-to-float-stats 20000 foo/work/step0/stats.1 '
    #                'foo/work/step0/stats.2 <foo/work/step0/float.all'
    command = "float-counts-to-float-stats {0} {1} <{2}/float.all".format(
        num_words, stats_star, step0)
    log_file = step0_log + "/float_counts_to_float_stats.log"
    RunCommand(command, log_file, args.verbose == 'true')

    # merge the per-order stats into stats.all, then drop the per-order files.
    command = "merge-float-counts {0} > {1}/stats.all".format(
        stats_star, step0)
    log_file = step0_log + "/merge_float_counts.log"
    RunCommand(command, log_file, args.verbose == 'true')
    for stats_file in stats_star.split():
        os.remove(stats_file)
Exemplo n.º 16
0
def MergeCounts(dest_count_dir, num_jobs, n, o):
    """Produce {dest_count_dir}/int.{n}.{o} from its per-job split files.

    Args:
        dest_count_dir: directory holding the int.{n}.split{j}.{o} files.
        num_jobs: number of split files (j runs over 1..num_jobs); must be
            at least 1.
        n: training-set index used in the file names.
        o: n-gram order used in the file names.

    With more than one job, the split files are merged with
    merge-int-counts (logged to {dest_count_dir}/log/merge_counts.{n}.{o}.log).
    With exactly one job, the single split file is simply renamed to the
    destination, replacing any existing destination file.

    Raises:
        AssertionError: if num_jobs is less than 1.
        OSError: if the rename in the single-job case fails.
    """
    merged_file = '{0}/int.{1}.{2}'.format(dest_count_dir, n, o)
    if num_jobs > 1:
        command = ('merge-int-counts ' + ' '.join([
            '{0}/int.{1}.split{2}.{3}'.format(dest_count_dir, n, j, o)
            for j in range(1, num_jobs + 1)
        ]) + '>' + merged_file)
        log_file = '{0}/log/merge_counts.{1}.{2}.log'.format(
            dest_count_dir, n, o)
        RunCommand(command, log_file, args.verbose == 'true')
    else:
        assert num_jobs == 1
        # we can just move the file if num-jobs == 1.
        try:
            os.remove(merged_file)
        except OSError:
            # best effort: the destination usually just does not exist yet.
            # Only file-system errors are ignored here, so that e.g.
            # KeyboardInterrupt is no longer swallowed.
            pass
        os.rename('{0}/int.{1}.split1.{2}'.format(dest_count_dir, n, o),
                  merged_file)
Exemplo n.º 17
0
def RunEmStep(work_in, work_out):
    """Run one float-counts-estimate step from work_in into work_out.

    The per-order outputs work_out/float.{1..ngram_order} are merged into
    work_out/float.all and then removed; stats.all, protected.all and
    num_ngrams are soft-linked from work_in.

    Sets the global final_logprob_per_word to total-like / total-count and
    returns the summed per-order like-change divided by total-count.  Any
    error while running the command or parsing its output is reported via
    ExitProgram.
    """
    global final_logprob_per_word

    # float_star = 'work_out/float.1 work_out/float.2 ...'
    float_files = []
    for n in range(1, ngram_order + 1):
        float_files.append('{0}/float.{1}'.format(work_out, n))
    float_star = " ".join(float_files)

    command = ('float-counts-estimate {0} {1}/float.all {1}/stats.all '
               '{2}'.format(num_words, work_in, float_star))
    log_file = "{0}/log/float_counts_estimate.log".format(work_out)
    try:
        output = GetCommandStdout(command, log_file, args.verbose == 'true')
        # the stdout of this program will be something like:
        # 1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        # representing: total-count, total-like, and for each order, the
        # like-change for that order.
        fields = output.split()
        tot_count = float(fields[0])
        tot_like = float(fields[1])
        final_logprob_per_word = tot_like / tot_count
        like_change = 0.0
        for per_order_change in fields[2:]:  # one entry per n-gram order
            like_change += float(per_order_change)
        like_change_per_word = like_change / tot_count
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))

    command = 'merge-float-counts {0} >{1}/float.all'.format(
        float_star, work_out)
    log_file = "{0}/log/merge_float_counts.log".format(work_out)
    RunCommand(command, log_file, args.verbose == 'true')
    for float_file in float_star.split():
        os.remove(float_file)

    # these files are carried over from work_in unchanged, as soft links.
    for shared in ("stats.all", "protected.all", "num_ngrams"):
        SoftLink("{0}/{1}".format(work_in, shared),
                 "{0}/{1}".format(work_out, shared))
    return like_change_per_word
Exemplo n.º 18
0
def GetNames(text_dir, int_dir):
    """Run get_names.py on text_dir, writing its stdout to int_dir/names.

    The command is logged to int_dir/log/get_names.log.
    """
    names_file = "{0}/names".format(int_dir)
    log_file = "{0}/log/get_names.log".format(int_dir)
    command = "get_names.py {0} > {1}".format(text_dir, names_file)
    RunCommand(command, log_file)
Exemplo n.º 19
0
def GetCountsMultiProcess(source_int_dir,
                          dest_count_dir,
                          ngram_order,
                          n,
                          num_proc,
                          max_mem,
                          num_splits=0):
    """Get int-counts for training set `n` using `num_proc` sort processes.

    Falls back to GetCountsSingleProcess when running on Cygwin, when
    num_proc <= 1, or when {source_int_dir}/{n}.txt.gz is smaller than
    1000000 bytes.

    Otherwise the input lines are distributed over num_proc named pipes
    created in a temporary directory; each pipe is read by its own
    'get-text-counts | sort', and the sorted streams are combined with
    'sort -m' before 'uniq -c | get-int-counts'.

    Args:
        source_int_dir: directory containing {n}.txt.gz.
        dest_count_dir: directory receiving the int-counts and the logs.
        ngram_order: n-gram order passed to get-text-counts.
        n: index of the training set.
        num_proc: number of parallel get-text-counts/sort processes.
        max_mem: '' for no limit; otherwise it is divided by num_proc + 1
            with DivideMemory and given to each sort as --buffer-size.
        num_splits: if 0, get-int-counts writes one file per order
            2..ngram_order; otherwise its output is piped to
            split-int-counts, which is given num_splits files.

    Failures are reported through ExitProgram.
    """
    try:
        file_size = os.path.getsize('{0}/{1}.txt.gz'.format(source_int_dir, n))
    except OSError:
        # only file-system errors are turned into this message, so that
        # e.g. KeyboardInterrupt is no longer swallowed here.
        ExitProgram('get_counts.py: error getting file size of '
                    '{0}/{1}.txt.gz'.format(source_int_dir, n))

    if IsCygwin() or num_proc <= 1 or file_size < 1000000:
        if num_proc > 1 and file_size >= 1000000:
            # it's only because of Cygwin that we're not using multiple
            # processes; this merits a warning.
            print(
                "get_counts.py: cygwin platform detected so named pipes won't work; "
                "using a single process (will be slower)")
        return GetCountsSingleProcess(source_int_dir, dest_count_dir,
                                      ngram_order, n, max_mem, num_splits)

    if num_splits == 0:
        int_counts_output = "/dev/null " + " ".join([
            "{0}/int.{1}.{2}".format(dest_count_dir, n, o)
            for o in range(2, ngram_order + 1)
        ])
    else:
        assert num_splits >= 1
        int_counts_output = '/dev/stdout | split-int-counts ' + \
            ' '.join(["{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
                     for j in range(1, num_splits + 1)])

    try:
        # we want a temporary directory on a local file system for the
        # named pipes.
        tempdir = tempfile.mkdtemp()
    except Exception as e:
        ExitProgram("Error creating temporary directory: " + str(e))

    # This has several pipes for the internal processing that write to and read
    # from other internal pipes; and we can't do this using '|' in the shell, we
    # need to use mkfifo.  This does not work properly on cygwin.

    log_dir = "{dest_count_dir}/log".format(dest_count_dir=dest_count_dir)
    # Each parallel branch touches a hidden '.{n}.{p}.error' marker in the
    # log directory if it fails; clear out any markers left by earlier runs.
    error_pattern = "{log_dir}/.{n}.*.error".format(log_dir=log_dir, n=n)
    for stale_marker in glob.glob(error_pattern):
        os.remove(stale_marker)

    log_file = "{log_dir}/get_counts.{n}.log".format(log_dir=log_dir, n=n)

    test_command = "bash -c 'set -o pipefail; (echo a; echo b) | "\
        "distribute-input-lines /dev/null /dev/null'"
    # We run the following command just to make sure distribute-input-lines is
    # on the path and compiled, since we get hard-to-debug errors if it fails.
    RunCommand(test_command, log_file)

    if max_mem == '':
        mem_opt = ''
    else:
        mem_opt = "--buffer-size={0}".format(
            DivideMemory(max_mem, num_proc + 1))
    # we use "bash -c '...'" to make sure it gets run in bash, since
    # for example 'set -o pipefail' would only work in bash.
    command = (
        "bash -c 'set -o pipefail; set -e; export LC_ALL=C; mkdir -p {0}; ".
        format(tempdir) + ''.join(
            ['mkfifo {0}/{1}; '.format(tempdir, p) for p in range(num_proc)]) +
        'trap "rm -r {0}" SIGINT SIGKILL SIGTERM EXIT; '.format(tempdir) +
        'gunzip -c {0}/{1}.txt.gz | distribute-input-lines '.format(
            source_int_dir, n) +
        ' '.join(['{0}/{1}'.format(tempdir, p) for p in range(num_proc)]) +
        '& ' + 'sort -m {0} '.format(mem_opt) + ' '.join([
            '<(get-text-counts {4} {0} <{1}/{2} | sort {3} || touch {5}/.{6}.{2}.error)'
            .format(
                ngram_order, tempdir, p, mem_opt, "--limit-unk-history"
                if args.limit_unk_history == 'true' else "", log_dir, n)
            for p in range(num_proc)
        ]) + '| uniq -c | get-int-counts {0}'.format(int_counts_output) + "'"
    )  # end the quote from the 'bash -c'.

    RunCommand(command, log_file, args.verbose == 'true')

    # any marker file present now means one of the parallel branches failed.
    if glob.glob(error_pattern):
        ExitProgram(
            "Something went wrong for the get-text-counts or sort command for training set {n}."
            .format(n=n))
Exemplo n.º 20
0
# get word counts.  The text directory is validated first; the script exits
# with status 1 if validate_text_dir.py fails.
word_counts_dir = os.path.join(work_dir, 'word_counts')
if os.system("validate_text_dir.py " + args.text_dir) != 0:
    sys.exit(1)
# the word counts depend on every .txt / .txt.gz file in the text directory.
last_done_files = [
    os.path.join(args.text_dir, name)
    for name in os.listdir(args.text_dir)
    if name.endswith((".txt", ".txt.gz"))
]
done_file = os.path.join(word_counts_dir, '.done')
# the step is run only when CheckFreshness returns a true value.
if CheckFreshness(done_file, last_done_files):
    log_file = os.path.join(log_dir, 'get_word_counts.log')
    LogMessage("Getting word counts... log in " + log_file)
    command = "get_word_counts.py {0} {1}".format(args.text_dir,
                                                  word_counts_dir)
    RunCommand(command, log_file, args.verbose == 'true')
    TouchFile(done_file)
else:
    LogMessage("Skip getting word counts")

# get unigram weights.  This step depends on the .done file of the word
# counts step above.
unigram_weights = os.path.join(args.text_dir, 'unigram_weights')
last_done_files = [done_file]
done_file = os.path.join(work_dir, '.unigram_weights.done')
if CheckFreshness(done_file, last_done_files):
    log_file = os.path.join(log_dir, 'get_unigram_weights.log')
    LogMessage("Getting unigram weights... log in " + log_file)
    command = "get_unigram_weights.py {0} > {1}".format(
        word_counts_dir, unigram_weights)
    RunCommand(command, log_file, args.verbose == 'true')
    TouchFile(done_file)
else:
    LogMessage("Skip getting unigram weights")
Exemplo n.º 21
0
def GetObjfAndDeriv(x):
    """Return (negated objf, negated derivative) at unconstrained point x.

    x is mapped to the constrained metaparameters, which are written to
    {args.optimize_dir}/{iteration}.metaparams.  The objective and its
    derivatives are then computed by get_objf_and_derivs[_split].py, unless
    files from a previous run can be re-used.  The global `iteration` is
    incremented on every call, and the global `value0` is set to the first
    returned objective value if it is still None.

    The sign is flipped on both outputs since we are minimizing.
    """
    global iteration, value0

    y = UnconstrainedToConstrained(x)

    prefix = "{0}/{1}".format(args.optimize_dir, iteration)
    metaparameter_file = prefix + ".metaparams"
    deriv_file = prefix + ".derivs"
    objf_file = prefix + ".objf"
    log_file = prefix + ".log"

    changed_or_new = WriteMetaparameters(metaparameter_file, y)

    enable_caching = True  # if true, enable re-use of files from a previous run.
    # re-use is only possible if the metaparameters are unchanged and the
    # derivs file exists (together with the objf file) and is newer than the
    # metaparameter file.
    reuse_previous = (enable_caching and not changed_or_new
                      and os.path.exists(deriv_file)
                      and os.path.exists(objf_file)
                      and os.path.getmtime(deriv_file) >
                      os.path.getmtime(metaparameter_file))
    if not reuse_previous:
        # we need to call get_objf_and_derivs.py (or its _split variant).
        use_splits = args.num_splits > 1
        if use_splits:
            script = "get_objf_and_derivs_split.py"
            split_opt = "--num-splits={0}".format(args.num_splits)
        else:
            script = "get_objf_and_derivs.py"
            split_opt = ""
        command = ("{0} {1} --cleanup={2} --derivs-out={3} {4} {5} "
                   "{6} {7}".format(script, split_opt, args.cleanup,
                                    deriv_file, args.count_dir,
                                    metaparameter_file, objf_file,
                                    args.optimize_dir + "/work"))
        RunCommand(command, log_file, verbose=True)
    else:
        print(
            "optimize_metaparameters.py: using previously computed objf and deriv "
            "info from {0} and {1} (presumably you are rerunning after a partially "
            "finished run)".format(deriv_file, objf_file),
            file=sys.stderr)

    df_dy = ReadMetaparametersOrDerivs(deriv_file)
    objf = ReadObjf(objf_file)
    iteration += 1

    (x2, df_dx) = ConstrainedToUnconstrained(y, df_dy)

    # check that x == x2, we just changed variables back and forth so it should
    # be the same.
    delta = x - x2
    if math.sqrt(np.dot(delta, delta)) > 0.001:
        print(
            "optimize_metaparameters.py: warning: difference {0} versus {1}\n".
            format(x, x2))

    deriv_magnitude = math.sqrt(np.vdot(df_dx, df_dx))
    print("Evaluation %d: objf=%.6f, deriv-magnitude=%.6f " %
          (iteration, objf, deriv_magnitude),
          file=sys.stderr)

    # we need to negate the objective function and derivatives, since we are
    # minimizing.
    scale = -1.0
    scaled_objf = objf * scale
    if value0 is None:
        value0 = scaled_objf
    return (scaled_objf, df_dx * scale)
Exemplo n.º 22
0
def RunPruneStep(work_in, work_out, threshold):
    """Prune the float-counts in work_in with `threshold`, into work_out.

    Runs float-counts-prune on work_in/float.all and work_in/protected.all,
    producing work_out/float.{1..ngram_order}, and parses the three lines
    the program prints on stdout:
      1. word-count and like-change;
      2. four numbers, read as tot_xgrams, shadowed, protected, pruned;
      3. a list of fields that is passed to WriteNumNgrams.
    The global current_num_xgrams is set to tot_xgrams - pruned.

    If args.remove_zeros is 'false', the per-order float files are merged
    into work_out/float.all and stats.all is soft-linked from work_in;
    otherwise the merged counts are piped through
    float-counts-stats-remove-zeros, and work_out/stats.all is rebuilt from
    the per-order stats it writes.  In both cases the per-order files are
    removed and work_out/protected.all is re-created.

    Returns the like-change divided by the word-count.  Any failure while
    running or parsing the prune command is reported through ExitProgram.
    """
    global current_num_xgrams

    # set float_star = 'work_out/float.1 work_out/float.2 ...'
    float_star = " ".join([
        '{0}/float.{1}'.format(work_out, n) for n in range(1, ngram_order + 1)
    ])
    # create work_out/float.{1,2,..}
    log_file = work_out + '/log/float_counts_prune.log'
    command = (
        "float-counts-prune {threshold} {num_words} {work_in}/float.all "
        "{work_in}/protected.all {float_star} 2>>{log_file}".format(
            threshold=threshold,
            num_words=num_words,
            work_in=work_in,
            float_star=float_star,
            log_file=log_file))
    # start the log with the command itself; the command's stderr is
    # appended to the same file by the '2>>' redirection.
    with open(log_file, 'w', encoding="utf-8") as f:
        print("# " + command, file=f)
    try:
        print(command, file=sys.stderr)
        # the 'with' block closes the stdout pipe and reaps the child even
        # if parsing its output fails.
        with subprocess.Popen(command,
                              stdout=subprocess.PIPE,
                              shell=True,
                              universal_newlines=True) as p:
            [word_count, like_change] = p.stdout.readline().split()
            like_change_per_word = float(like_change) / float(word_count)
            [tot_xgrams, shadowed, protected,
             pruned] = p.stdout.readline().split()
            num_ngrams = p.stdout.readline().split()

            # explicit checks rather than asserts, so that they are not
            # stripped when python runs with -O.
            if p.stdout.readline() != '':
                raise RuntimeError(
                    "unexpected extra output on stdout after the third line")
            ret = p.wait()
        if ret != 0:
            raise RuntimeError(
                "command exited with nonzero status {0}".format(ret))

        current_num_xgrams = int(tot_xgrams) - int(pruned)
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))

    WriteNumNgrams(work_out, num_ngrams)

    if args.remove_zeros == 'false':
        # create work_out/float.all.
        command = 'merge-float-counts {0} >{1}/float.all'.format(
            float_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        for f in float_star.split():
            os.remove(f)
        # soft-link work_out/stats.all to work_in/stats.all
        SoftLink(work_in + "/stats.all", work_out + "/stats.all")
    else:
        # in this case we pipe the output of merge-float-counts into
        # float-counts-stats-remove-zeros.
        # set stats_star = 'work_out/stats.1 work_out/stats.2 ..'
        stats_star = " ".join([
            '{0}/stats.{1}'.format(work_out, n)
            for n in range(1, ngram_order + 1)
        ])

        command = (
            'merge-float-counts {float_star} | float-counts-stats-remove-zeros '
            '{num_words} /dev/stdin {work_in}/stats.all {work_out}/float.all '
            '{stats_star}'.format(num_words=num_words,
                                  float_star=float_star,
                                  work_in=work_in,
                                  work_out=work_out,
                                  stats_star=stats_star))
        log_file = work_out + '/log/remove_zeros.log'
        RunCommand(command, log_file, args.verbose == 'true')
        # create work_out/stats.all
        command = 'merge-float-counts {0} >{1}/stats.all'.format(
            stats_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        for f in float_star.split() + stats_star.split():
            os.remove(f)

    # create work_out/protected.all
    CreateProtectedCounts(work_out)
    return like_change_per_word
Exemplo n.º 23
0
def DiscountCountsOrder1():
    """Run discount-counts-1gram in args.work_dir.

    The program is given num_words, reads work_dir/discount.1 on stdin, and
    its stdout is written to work_dir/float.1.
    """
    work = args.work_dir
    command = "discount-counts-1gram {0} <{1}/discount.1 >{1}/float.1".format(
        num_words, work)
    log_file = "{0}/log/discount_counts_order1.log".format(work)
    RunCommand(command, log_file, args.verbose == 'true')