示例#1
0
def CombineNumNgrams():
    """Combine the per-split 'num_ngrams' files into a single total.

    For each split 1..args.num_splits, reads
    '<split_work_dir>/<split>/num_ngrams'.  The second whitespace-separated
    field of every line is taken as a count; the line position (0-based)
    selects the slot it is accumulated into.  The count on the first line
    (the unigram count) must be identical in every split and is stored once;
    counts on later lines are summed over the splits.  The totals are then
    written with WriteNumNgrams(args.work_dir, ...).

    Calls ExitProgram() if a file cannot be read or parsed, if a count is
    not positive, or if the unigram counts of two splits differ.
    """
    tot_nums = []
    for split_index in range(1, args.num_splits + 1):
        this_split_work = "{0}/{1}".format(split_work_dir, split_index)
        num_file = this_split_work + "/num_ngrams"
        try:
            # 'with' releases the handle even when a line fails to parse.
            with open(num_file, "r") as f:
                for order, line in enumerate(f):
                    num = int(line.split()[1])
                    # Explicit check rather than 'assert', which is removed
                    # when Python runs with -O.
                    if num <= 0:
                        raise ValueError(
                            "non-positive n-gram count: {0}".format(num))

                    if order == 0:
                        if len(tot_nums) == 0:
                            tot_nums.append(num)
                        elif tot_nums[0] != num:
                            ExitProgram("get_objf_and_derivs_split.py: num-unigrams are not identical")
                    elif len(tot_nums) < order + 1:
                        tot_nums.append(num)
                    else:
                        tot_nums[order] += num
        except Exception:
            # 'except Exception' (not a bare except) so that a SystemExit or
            # KeyboardInterrupt raised above is not re-reported as a read error.
            ExitProgram("get_objf_and_derivs_split.py: error reading num-ngrams from: " + num_file)

    WriteNumNgrams(args.work_dir, tot_nums)
示例#2
0
def GetInitialLogprob():
    """Compute the log-probability per word of the model in '<work_dir>/step0'.

    Runs 'float-counts-estimate' on step0/float.all and step0/stats.all with
    every per-order output directed to /dev/null, parses the first line of
    its stdout and stores total-like / total-count in the global
    'initial_logprob_per_word'.

    Calls ExitProgram() if the command fails, exits with a non-zero status,
    prints something unparseable, or reports a per-word likelihood change
    that is not (close to) zero.
    """
    global initial_logprob_per_word
    work0dir = work_dir + "/step0"
    # One /dev/null per n-gram order: the per-order outputs are discarded.
    float_star = ' '.join(['/dev/null'] * ngram_order)
    command = ('float-counts-estimate {num_words} {work0dir}/float.all '
               '{work0dir}/stats.all {float_star} '.format(
                   num_words=num_words,
                   work0dir=work0dir,
                   float_star=float_star))
    try:
        print(command, file=sys.stderr)
        p = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             shell=True,
                             universal_newlines=True)
        # communicate() drains stdout and reaps the child, so neither an open
        # pipe nor a zombie process is left behind.
        (stdout_text, _) = p.communicate()
        if p.returncode != 0:
            raise RuntimeError(
                "command exited with status {0}".format(p.returncode))
        # the stdout of this program will be something like:
        # 1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        # representing: total-count, total-like, and for each order, the like-change
        # for that order.
        stdout_lines = stdout_text.splitlines(True)
        line = stdout_lines[0] if stdout_lines else ''
        print(line, file=sys.stderr)
        a = line.split()
        tot_count = float(a[0])
        tot_like = float(a[1])
        like_change = 0.0
        logprob_per_word = tot_like / tot_count
        for i in range(2, len(a)):  # for each n-gram order
            like_change += float(a[i])
        like_change_per_word = like_change / tot_count
        # should be exactly zero; checked explicitly because 'assert' is
        # stripped under -O.
        if not like_change_per_word < 0.0001:
            raise ValueError(
                "unexpected like-change per word: {0}".format(
                    like_change_per_word))
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))
    initial_logprob_per_word = logprob_per_word
示例#3
0
def MergeCountsBackward(split_index, order):
    """Run the backprop version of count merging for one split and order.

    Builds and runs a 'merge-counts-backward' command for the given split
    directory and n-gram order (which must be > 1), then adds the numbers the
    command prints on stdout, each divided by num_dev_set_words_total, to the
    global 'scale_derivs' list.  Calls ExitProgram() if the output cannot be
    interpreted.
    """
    global scale_derivs
    # merge counts of the specified order > 1; the backprop phase.
    assert order > 1

    pieces = ["merge-counts-backward {swork}/{s}/merged.{order} {swork}/{s}/merged_derivs.{order} ".format(
        swork=split_work_dir, s=split_index, order=order)]

    # one (counts-file, scale) pair per training set
    for train_set in range(1, num_train_sets + 1):
        pieces.append(" {split_counts}/{s}/int.{train_set}.{order} {scale}".format(
            split_counts=split_count_dir, s=split_index, train_set=train_set,
            order=order, scale=train_set_scale[train_set]))

    # for orders less than the highest order, we also have to include the
    # discount counts from the one-higher order, and provide a filename
    # for it to output the derivatives w.r.t. that file.
    if order < ngram_order:
        pieces.append(" {swork}/{s}/discount.{order} {swork}/{s}/discount_derivs.{order}".format(
            swork=split_work_dir, s=split_index, order=order))

    command = "".join(pieces)
    log_file = "{0}/log/merge_counts_backward.{1}.{2}.log".format(
        args.work_dir, split_index, order)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        this_scale_derivs = []
        for field in output.split():
            this_scale_derivs.append(float(field) / num_dev_set_words_total)
        assert len(scale_derivs) == num_train_sets
        # the scaling factors are applied for each order > 1, and the
        # derivatives will be a sum over the derivatives for each of these
        # orders (and also a sum over the different split-directories).
        for idx in range(num_train_sets):
            scale_derivs[idx] += this_scale_derivs[idx]
    except:
        ExitProgram("get_objf_and_derivs_split.py: unexpected output from command:" + output)
示例#4
0
def FindThreshold(initial_threshold):
    """Search for a pruning threshold by repeatedly pruning and asking a model.

    Each pass of the loop appends a 'prune*1.0' step to the global 'steps'
    list, runs it with RunStep() at the current threshold, and then asks a
    PruneSizeModel what to do next given the global 'current_num_xgrams'.

    Returns:
        (0.0, 0) when the model answers 'overshoot';
        (cur_threshold, model.iter) when the model answers 'success'.

    Calls ExitProgram() when model.iter exceeds args.max_iter.

    Side effects: appends to the globals 'steps', 'logprob_changes' and
    'effective_logprob_changes', and to 'thresholds' (used here without a
    'global' statement, so it is presumably a module-level list -- TODO
    confirm).
    """
    global initial_num_xgrams, current_num_xgrams, num_unigrams, steps
    global logprob_changes, effective_logprob_changes

    model = PruneSizeModel(num_unigrams, args.target_num_ngrams,
                           args.target_lower_threshold,
                           args.target_upper_threshold)
    #    model.SetDebug(True)

    model.SetInitialThreshold(initial_threshold, initial_num_xgrams)

    cur_threshold = initial_threshold
    backtrack_iter = 0
    step = 0
    iter2step = [
        0
    ]  # This maps an iter-index to the step-index of the last step of that iteration
    while True:
        # Queue and run one pruning step.  Its input is the output of the
        # step recorded for iteration 'backtrack_iter', which is not
        # necessarily the immediately preceding step.
        steps += ['prune*1.0']
        logprob_change = RunStep(step,
                                 cur_threshold,
                                 in_step=iter2step[backtrack_iter])
        logprob_changes.append(logprob_change)
        effective_logprob_changes.append(logprob_change)
        thresholds.append(cur_threshold)
        step += 1

        # NOTE(review): current_num_xgrams is presumably refreshed by the
        # prune step that just ran -- confirm against RunPruneStep.
        (action, arguments) = model.GetNextAction(current_num_xgrams)
        if action == 'overshoot':
            return (0.0, 0)

        if action == 'backtrack':
            # 'arguments' is a (threshold, iteration) pair here.
            (cur_threshold, backtrack_iter) = arguments
            assert (iter2step[backtrack_iter] > 0)
            # Drop the changes recorded after the step we are going back to;
            # 'logprob_changes' itself is left untouched.
            del effective_logprob_changes[iter2step[backtrack_iter]:]
            # This iteration gets a placeholder entry (-1) in iter2step.
            iter2step.append(-1)
            continue

        # EM steps
        steps += 'EM EM'.split()
        # Run every step that has been queued but not yet executed
        # (the two EM steps just appended), with a threshold of 0.0.
        while step < len(steps):
            logprob_change = RunStep(step, 0.0)
            logprob_changes.append(logprob_change)
            effective_logprob_changes.append(logprob_change)
            step += 1

        # Record the step index reached at the end of this iteration.
        iter2step.append(step)

        if action == 'success':
            return (cur_threshold, model.iter)

        # action == 'continue':
        if model.iter > args.max_iter:
            ExitProgram(
                "Too many iterations, please set a higher --initial-threshold and rerun."
            )

        # For 'continue', 'arguments' is the next threshold to try.
        cur_threshold = arguments
        backtrack_iter = model.iter
示例#5
0
def CopyMetaInfo(source_int_dir, dest_count_dir):
    """Copy the metadata files of an int-dir into a count-dir.

    The files 'num_train_sets', 'num_words', 'names' and 'words.txt' are
    copied, in that order, from source_int_dir to dest_count_dir.  On any
    failure ExitProgram() is called with the offending source/destination
    pair.
    """
    meta_files = ('num_train_sets', 'num_words', 'names', 'words.txt')
    for name in meta_files:
        try:
            src = os.path.sep.join((source_int_dir, name))
            dest = os.path.sep.join((dest_count_dir, name))
            shutil.copy(src, dest)
        except:
            ExitProgram('error copying {0} to {1}'.format(src, dest))
示例#6
0
def SoftLink(src, dest):
    """Make 'dest' a symbolic link to the absolute path of 'src'.

    Anything already present at 'dest' (including an existing symlink) is
    removed first.  Calls ExitProgram() if the link cannot be created.
    """
    # lexists() does not follow symlinks, so an existing link at 'dest' is
    # detected and replaced even if its target is gone.
    if os.path.lexists(dest):
        os.remove(dest)
    target = os.path.abspath(src)
    try:
        os.symlink(target, dest)
    except:
        ExitProgram("error linking {0} to {1}".format(target, dest))
示例#7
0
def WriteNumNgrams(out_dir, num_ngrams):
    """Write n-gram counts to '<out_dir>/num_ngrams'.

    One line is written per entry of 'num_ngrams', in the form
    '<order> <count>', with orders numbered from 1.  Calls ExitProgram() if
    the file cannot be opened or written.
    """
    out_file = out_dir + "/num_ngrams"
    try:
        # 'with' closes the file even if a write fails part-way through.
        with open(out_file, "w") as f:
            for order, num in enumerate(num_ngrams, 1):
                print(str(order) + ' ' + str(num), file=f)
    except Exception:
        ExitProgram("get_objf_and_derivs_split.py: error writing num-ngrams to: " + out_file)
示例#8
0
def WriteNumNgrams(out_dir, num_ngrams):
    """Write n-gram counts to '<out_dir>/num_ngrams' (UTF-8 encoded).

    Each entry of 'num_ngrams' becomes one line '<order> <count>', where the
    order starts at 1.  Calls ExitProgram() if the file cannot be opened or
    written.
    """
    out_file = out_dir + "/num_ngrams"
    try:
        # The context manager guarantees the handle is closed on every path,
        # including a failure in the middle of writing.
        with open(out_file, "w", encoding="utf-8") as f:
            for index, num in enumerate(num_ngrams):
                print(str(index + 1) + ' ' + str(num), file=f)
    except Exception:
        ExitProgram("error writing num-ngrams to: " + out_file)
示例#9
0
def SaveNgramOrder(dest_count_dir, ngram_order):
    """Write 'ngram_order' to the file '<dest_count_dir>/ngram_order'.

    Raises AssertionError if ngram_order is not at least 2.  The check is
    done before the file is opened, so an invalid order no longer leaves an
    empty file (and an open handle) behind.  Calls ExitProgram() if the file
    cannot be opened for writing.
    """
    # Raised explicitly (rather than via 'assert') so the check also runs
    # under -O; the exception type is unchanged for callers.
    if not ngram_order >= 2:
        raise AssertionError(
            "ngram_order must be at least 2, got {0}".format(ngram_order))
    try:
        f = open('{0}/ngram_order'.format(dest_count_dir), 'w')
    except Exception:
        ExitProgram('error opening file {0}/ngram_order for writing'.format(
            dest_count_dir))
        return
    with f:
        print(ngram_order, file=f)
示例#10
0
def WriteObjectiveFunction():
    """Report the objective function and write it to args.objf_out.

    The objective is the global loglike_total divided by the global
    num_dev_set_words_total.  It is printed to stderr together with the word
    count and written, as a single line, to the file named by args.objf_out.
    Calls ExitProgram() if that file cannot be opened or written.
    """
    objf = loglike_total / num_dev_set_words_total
    print("get_objf_and_derivs_split.py: objf is {0} over {1} "
          "words".format(objf, num_dev_set_words_total), file=sys.stderr)
    # Write the objective function.
    try:
        # 'with' closes the file even if the write itself fails.
        with open(args.objf_out, "w") as f:
            print(str(objf), file=f)
    except Exception:
        ExitProgram("get_objf_and_derivs_split.py: error writing objective function to: " +
                    args.objf_out)
示例#11
0
def ParseNumNgrams(out_dir, merge_all_orders_log):
    """Extract the per-order n-gram counts from a log file and save them.

    Scans 'merge_all_orders_log' for lines (not starting with '#') matching
    'Write <...> individual n-grams.'.  The matched text is expected to look
    like 'num1 + num2 = tot', or just 'num1' for a unigram model; the part
    before '=' is split into the individual counts.  If several lines match,
    the last one wins.  The counts (kept as strings) are passed to
    WriteNumNgrams(out_dir, ...).

    Calls ExitProgram() if the log cannot be read or no counts are found.
    """
    num_ngrams = []
    try:
        # 'with' closes the log even if a line cannot be processed.
        with open(merge_all_orders_log, "r") as f:
            for line in f:
                if line[0] == '#':
                    continue
                m = re.search('Write (.*) individual n-grams.', line)
                if m:
                    # The matched string should be 'num1 + num2 = tot' or just 'num1' for unigram model
                    nums_str = m.group(1).split('=')[0].strip()
                    # Raw string: same pattern as before, but without the
                    # invalid '\+' escape in a normal string literal.
                    num_ngrams = re.split(r'[\+| ]+', nums_str)
    except Exception:
        ExitProgram("get_objf_and_derivs_split.py: error reading merge_all_orders_log from: " + merge_all_orders_log)

    if len(num_ngrams) == 0:
        ExitProgram("get_objf_and_derivs_split.py: error parsing num_ngrams from: " + merge_all_orders_log)

    WriteNumNgrams(out_dir, num_ngrams)
示例#12
0
def GetNumTrainSets(int_dir):
    """Return the integer in the first field of the last line of a file.

    'int_dir' is opened directly as a file (despite its name).  Every line
    must consist of exactly two whitespace-separated fields, the first of
    which is an integer; the value from the last line is returned.

    Calls ExitProgram() if a line is malformed or if the file contains no
    lines at all (previously an empty file raised UnboundLocalError).
    """
    ans = None
    with open(int_dir) as f:
        for line in f:
            try:
                fields = line.split()
                # Explicit check instead of 'assert', which -O would remove.
                if len(fields) != 2:
                    raise ValueError("expected two fields: " + line)
                ans = int(fields[0])
            except Exception:
                ExitProgram("failed to get the num_train_sets from {0}".format(
                    int_dir))

    if ans is None:
        ExitProgram("failed to get the num_train_sets from {0}".format(
            int_dir))
    return ans
示例#13
0
def GetNumWords(vocab):
    """Return the integer in the second field of the last line of 'vocab'.

    The last line is obtained with 'tail -n 1' and must consist of exactly
    two whitespace-separated fields, the second being an integer.  Calls
    ExitProgram() if the line does not have that form.
    """
    # Pass an argument list without a shell so that file names containing
    # spaces or shell metacharacters are handled correctly.
    line = subprocess.check_output(["tail", "-n", "1", "--", vocab],
                                   universal_newlines=True)
    try:
        a = line.split()
        if len(a) != 2:
            raise ValueError("expected two fields, got: " + line)
        ans = int(a[1])
    except Exception:
        ExitProgram("failed to get the num_words from {0}".format(vocab))

    return ans
示例#14
0
def DivideMemory(total, n):
    """Divide a 'sort'-style memory specification into n equal parts.

    'total' is parsed by ParseMemoryString() into (value, unit).  If value is
    a multiple of n the result is simply value/n with the same unit.
    Otherwise the value is re-expressed in the next smaller unit (K or no
    unit -> b, M -> K, G -> M, each by a factor of 1024) before dividing, so
    that precision is not lost.  For b, B and % there is no smaller unit: the
    quotient is truncated, and ExitProgram() is called if it would be zero.
    Any other unit with a non-integral share causes ExitProgram().

    Integer (floor) division is used throughout.  With '/' the
    "not evenly divisible" test can never fire under Python 3's true
    division, so e.g. 1G split 3 ways would be reported as '0G'.

    Returns the share as a string such as '341M'.
    """
    (value, unit) = ParseMemoryString(total)
    sub_memory = value // n
    if sub_memory * n != value:
        if unit in ['K', 'k', '']:
            sub_memory = value * 1024 // n
            unit = 'b'
        elif unit in ['M', 'm']:
            sub_memory = value * 1024 // n
            unit = 'K'
        elif unit in ['G', 'g']:
            sub_memory = value * 1024 // n
            unit = 'M'
        elif unit in ['B', 'b', '%']:
            if sub_memory == 0:
                ExitProgram("max_memory for each of the {0} train sets is {1}{2}."
                            "Please reset a larger max_memory value".format(
                                n,
                                float(value) / n, unit))
            # otherwise keep the truncated quotient in the same unit
        else:
            ExitProgram("Invalid format for max_memory. "
                        "Please 'man sort' to see how to set buffer size.")
    return str(int(sub_memory)) + unit
示例#15
0
def GetNumWords(lm_dir_in):
    """Return the integer in the second field of the last line of words.txt.

    Reads the last line of '<lm_dir_in>/words.txt' with 'tail -n 1'; it must
    consist of exactly two whitespace-separated fields, the second being an
    integer.  Calls ExitProgram() if the line does not have that form.
    """
    words_file = "{0}/words.txt".format(lm_dir_in)
    # Only used in the error message below; the command itself is run from
    # an argument list, without a shell, so that directory names containing
    # spaces or shell metacharacters work.
    command = "tail -n 1 {0}".format(words_file)
    line = subprocess.check_output(["tail", "-n", "1", "--", words_file],
                                   universal_newlines=True)
    try:
        a = line.split()
        if len(a) != 2:
            raise ValueError("expected two fields")
        ans = int(a[1])
    except Exception:
        ExitProgram("error: unexpected output '{0}' from command {1}".format(
            line, command))
    return ans
示例#16
0
def WriteDerivs():
    """Write the accumulated derivatives to the file args.derivs_out.

    First one 'count_scale_<n> <value>' line per training set (n counted
    from 1, values taken from scale_derivs), then for every order from 2 up
    to ngram_order the four lines 'order<o>_D1' .. 'order<o>_D4' with the
    values from d1_deriv .. d4_deriv.  Calls ExitProgram() if the output
    file cannot be opened.
    """
    try:
        out = open(args.derivs_out, "w")
    except:
        ExitProgram("get_objf_and_derivs_split.py: error opening --derivs-out={0} for writing".format(
                 args.derivs_out))

    for idx in range(num_train_sets):
        line = "count_scale_{0} {1}".format(idx + 1, scale_derivs[idx])
        print(line, file=out)

    for order in range(2, ngram_order + 1):
        print("order{0}_D1 {1}".format(order, d1_deriv[order]), file=out)
        print("order{0}_D2 {1}".format(order, d2_deriv[order]), file=out)
        print("order{0}_D3 {1}".format(order, d3_deriv[order]), file=out)
        print("order{0}_D4 {1}".format(order, d4_deriv[order]), file=out)

    out.close()
示例#17
0
def RunStep(step_number, threshold, **kwargs):
    """Run the step steps[step_number] and return its result.

    The output directory is '<work_dir>/step<step_number + 1>' (its 'log'
    subdirectory is created if needed).  The input directory is
    '<work_dir>/step<step_number>' unless the keyword argument 'in_step'
    names a different step.

    A step of the form 'prune*<scale>' runs RunPruneStep() with
    threshold * scale; 'EM' runs RunEmStep().  Anything else, or a scale
    that is zero or not a number, leads to ExitProgram().
    """
    source_step = kwargs['in_step'] if 'in_step' in kwargs else step_number
    work_in = work_dir + "/step" + str(source_step)
    work_out = work_dir + "/step" + str(step_number + 1)

    log_dir = work_out + "/log"
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    step_text = steps[step_number]
    if step_text == 'EM':
        return RunEmStep(work_in, work_out)

    if step_text[0:6] != 'prune*':
        ExitProgram("invalid step (wrong --steps "
                    "option): '{0}'".format(step_text))
        return None

    # everything after 'prune*' is the factor applied to the threshold
    try:
        scale = float(step_text[6:])
        assert scale != 0.0
    except:
        ExitProgram("invalid step (wrong --steps "
                    "option): '{0}'".format(step_text))
    return RunPruneStep(work_in, work_out, threshold * scale)
示例#18
0
def FinalizeOutput(final_work_out):
    """Populate args.lm_dir_out from the last working directory.

    Moves 'float.all' and copies 'num_ngrams' from final_work_out, writes
    the marker file 'was_pruned' containing 'true', copies 'names',
    'words.txt', 'ngram_order' and 'metaparameters' from args.lm_dir_in, and
    finally removes a 'num_splits' file from the output directory if one is
    present.  Move/copy failures are reported through ExitProgram().
    """
    try:
        shutil.move(final_work_out + "/float.all",
                    args.lm_dir_out + "/float.all")
    except:
        ExitProgram("error moving {0}/float.all to {1}/float.all".format(
            final_work_out, args.lm_dir_out))

    try:
        shutil.copy(final_work_out + "/num_ngrams",
                    args.lm_dir_out + "/num_ngrams")
    except:
        ExitProgram("error copying {0}/num_ngrams to {1}/num_ngrams".format(
            final_work_out, args.lm_dir_out))

    # marker file recording that this LM directory holds a pruned model
    f = open(args.lm_dir_out + "/was_pruned", "w", encoding="utf-8")
    print("true", file=f)
    f.close()

    carried_over = ['names', 'words.txt', 'ngram_order', 'metaparameters']
    for f in carried_over:
        try:
            shutil.copy(args.lm_dir_in + "/" + f, args.lm_dir_out + "/" + f)
        except:
            ExitProgram("error copying {0}/{1} to {2}/{1}".format(
                args.lm_dir_in, f, args.lm_dir_out))

    if os.path.exists(args.lm_dir_out + "/num_splits"):
        os.remove(args.lm_dir_out + "/num_splits")
示例#19
0
def RunEmStep(work_in, work_out):
    """Run one E-M step from directory work_in into directory work_out.

    Runs 'float-counts-estimate' on work_in/float.all and work_in/stats.all,
    producing work_out/float.<order> for each order, and parses its stdout.
    The per-order files are then merged into work_out/float.all and removed,
    and 'stats.all', 'protected.all' and 'num_ngrams' in work_out are made
    soft links to the corresponding files in work_in.

    Sets the global final_logprob_per_word and returns the summed
    likelihood change divided by the total count.  Calls ExitProgram() if
    the estimate command fails or its output cannot be parsed.
    """
    global final_logprob_per_word

    # float_star = 'work_out/float.1 work_out/float.2 ...'
    per_order_files = []
    for n in range(1, ngram_order + 1):
        per_order_files.append('{0}/float.{1}'.format(work_out, n))
    float_star = " ".join(per_order_files)

    command = (
        'float-counts-estimate {num_words} {work_in}/float.all {work_in}/stats.all '
        '{float_star}'.format(num_words=num_words,
                              work_in=work_in,
                              float_star=float_star))
    log_file = work_out + "/log/float_counts_estimate.log"
    try:
        output = GetCommandStdout(command, log_file, args.verbose == 'true')
        # the stdout of this program will be something like:
        # 1.63388e+06 -7.39182e+06 10.5411 41.237 49.6758
        # representing: total-count, total-like, and for each order, the like-change
        # for that order.
        fields = output.split()
        tot_count = float(fields[0])
        tot_like = float(fields[1])
        final_logprob_per_word = tot_like / tot_count
        like_change = 0.0
        for per_order_change in fields[2:]:  # one entry per n-gram order
            like_change += float(per_order_change)
        like_change_per_word = like_change / tot_count
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))

    # merge the per-order files into work_out/float.all, then drop them
    command = 'merge-float-counts {0} >{1}/float.all'.format(
        float_star, work_out)
    log_file = work_out + '/log/merge_float_counts.log'
    RunCommand(command, log_file, args.verbose == 'true')
    for path in float_star.split():
        os.remove(path)

    # these files are not changed by an E-M step, so just link to the inputs
    for name in ("stats.all", "protected.all", "num_ngrams"):
        SoftLink(work_in + "/" + name, work_out + "/" + name)
    return like_change_per_word
示例#20
0
def ComputeObjfAndFinalDerivs(split_index, need_derivs):
    """Run 'compute-probs' for one split and accumulate its results.

    The command is given the split's float.all and int.dev files and, when
    need_derivs is true, one 'float_derivs.<order>' output file per order.
    Its stdout must consist of two fields: a word count, added to the global
    num_dev_set_words_total, and a total objective, added to the global
    loglike_total.  Calls ExitProgram() if the output has another form.
    """
    global num_dev_set_words_total, loglike_total

    command = "compute-probs {swork}/{s}/float.all {scount}/{s}/int.dev ".format(
        swork=split_work_dir, s=split_index, scount=split_count_dir)
    if need_derivs:
        deriv_files = []
        for o in range(1, ngram_order + 1):
            deriv_files.append("{swork}/{s}/float_derivs.{order}".format(
                swork=split_work_dir, s=split_index, order=o))
        command = command + " ".join(deriv_files)

    log_file = "{0}/log/compute_objf_and_final_derivs.{1}.log".format(
        args.work_dir, split_index)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        (word_count_field, objf_field) = output.split()
        num_dev_set_words_total += int(word_count_field)
        loglike_total += float(objf_field)
    except:
        ExitProgram("get_objf_and_derivs_split.py: error interpreting the output of compute-probs: "
                    "output was: " + output)
示例#21
0
def DiscountCountsBackward(split_index, order):
    """Run the backprop version of discounting for one split and order > 1.

    Runs 'discount-counts-backward' with the discounting constants
    d1[order] .. d4[order] on the files of the given split directory.  The
    command must print four numbers; each is divided by
    num_dev_set_words_total and added to d1_deriv[order] .. d4_deriv[order]
    respectively.  Calls ExitProgram() if the output does not consist of
    exactly four fields.
    """
    # discount counts of the specified order > 1; backprop version.
    assert order > 1
    this_split_work = "{0}/{1}".format(split_work_dir, split_index)
    command = ("discount-counts-backward {d1} {d2} {d3} {d4} {sdir}/merged.{order} {sdir}/float.{order} "
               "{sdir}/float_derivs.{order} {sdir}/discount.{orderm1} {sdir}/discount_derivs.{orderm1} "
               "{sdir}/merged_derivs.{order}".format(
                   d1=d1[order], d2=d2[order], d3=d3[order], d4=d4[order],
                   sdir=this_split_work, order=order, orderm1=order - 1))
    log_file = "{0}/log/discount_counts_backward.{1}.{2}.log".format(
        args.work_dir, split_index, order)
    output = GetCommandStdout(command, log_file, args.verbose == 'true')
    try:
        (deriv1, deriv2, deriv3, deriv4) = output.split()
    except:
        ExitProgram("get_objf_and_derivs_split.py: could not parse output of command: " + output)

    # accumulate, in the order D1, D2, D3, D4
    accumulators = ((d1_deriv, deriv1), (d2_deriv, deriv2),
                    (d3_deriv, deriv3), (d4_deriv, deriv4))
    for (table, raw_value) in accumulators:
        table[order] += float(raw_value) / num_dev_set_words_total
示例#22
0
def CopyFile(src, dest):
    """Copy the file 'src' to 'dest' using shutil.copy.

    Calls ExitProgram() with a message naming both paths if the copy fails.
    """
    try:
        shutil.copy(src, dest)
    except:
        message = "prepare_int_data.py: error copying {0} to {1}".format(
            src, dest)
        ExitProgram(message)
示例#23
0
                            float(value) / n, unit))
        else:
            ExitProgram("Invalid format for max_memory. "
                        "Please 'man sort' to see how to set buffer size.")
    return str(int(sub_memory)) + unit


# Make sure the directory containing this script and its sibling '../src'
# directory are on the PATH, so the helper programs invoked below are found.
os.environ['PATH'] = (os.environ['PATH'] + os.pathsep +
                      os.path.abspath(os.path.dirname(sys.argv[0])) +
                      os.pathsep +
                      os.path.abspath(os.path.dirname(sys.argv[0])) +
                      "/../src")

# Validate the input directory before doing any work.
if os.system("validate_int_dir.py " + args.source_int_dir) != 0:
    ExitProgram("command validate_int_dir.py {0} failed".format(
        args.source_int_dir))

if args.ngram_order < 2:
    ExitProgram("ngram-order is {0}; it must be at least 2.  If you "
                "want a unigram LM, do it by hand".format(args.ngram_order))

# Read the variable 'num_train_sets' from the corresponding file in
# source_int_dir.  This shouldn't fail because we just called
# validate_int_dir.py.  The 'with' block closes the file even if the
# contents cannot be converted to an integer.
with open(args.source_int_dir + "/num_train_sets") as f:
    num_train_sets = int(f.readline())

if not os.path.isdir(args.dest_count_dir):
    try:
        os.makedirs(args.dest_count_dir + '/log')
示例#24
0
            .format(args.derivs_out))
    for n in range(num_train_sets):
        print("count_scale_{0} {1}".format(n + 1, scale_derivs[n]), file=f)
    for o in range(2, ngram_order + 1):
        print("order{0}_D1 {1}".format(o, d1_deriv[o]), file=f)
        print("order{0}_D2 {1}".format(o, d2_deriv[o]), file=f)
        print("order{0}_D3 {1}".format(o, d3_deriv[o]), file=f)
        print("order{0}_D4 {1}".format(o, d4_deriv[o]), file=f)
    f.close()


# Create the log directory under the work directory if it does not exist yet.
if not os.path.isdir(args.work_dir + "/log"):
    try:
        os.makedirs(args.work_dir + "/log")
    except:
        ExitProgram("error creating directory {0}/log".format(args.work_dir))

# for n-gram orders down to 2, do the merging and discounting.
# (highest order first: range(ngram_order, 1, -1) stops before 1)
for o in range(ngram_order, 1, -1):
    MergeCounts(o)
    DiscountCounts(o)

# Order 1 is handled by its own function, after all higher orders.
DiscountCountsOrder1()
MergeAllOrders()
# The argument tells the function whether derivatives were requested,
# i.e. whether --derivs-out was supplied.
ComputeObjfAndFinalDerivs(args.derivs_out is not None)

# Without --derivs-out there is nothing more to do: optionally clean up and
# exit successfully here.
if args.derivs_out is None:
    if args.cleanup == 'true':
        Cleanup()
    sys.exit(0)
示例#25
0
def RunPruneStep(work_in, work_out, threshold):
    """Run one pruning step from directory work_in into directory work_out.

    Runs 'float-counts-prune' with the given threshold on work_in/float.all
    and work_in/protected.all, writing work_out/float.<order> for each order,
    and parses the three lines it prints on stdout.  The global
    current_num_xgrams is set to (total x-grams - pruned x-grams), and the
    per-order n-gram counts are saved with WriteNumNgrams().

    Depending on args.remove_zeros, the per-order float files are then either
    merged directly into work_out/float.all (with stats.all soft-linked from
    work_in), or piped through 'float-counts-stats-remove-zeros', which also
    produces new per-order stats that are merged into work_out/stats.all.
    The per-order temporary files are deleted and CreateProtectedCounts() is
    called on work_out.

    Returns the likelihood change divided by the word count, as reported on
    the first output line.  Calls ExitProgram() if the prune command fails or
    its output does not have the expected form.
    """
    # set float_star = 'work_out/float.1 work_out/float.2 ...'
    float_star = " ".join([
        '{0}/float.{1}'.format(work_out, n) for n in range(1, ngram_order + 1)
    ])
    # create work_out/float.{1,2,..}
    log_file = work_out + '/log/float_counts_prune.log'
    # stderr of the command is appended (2>>) to the log file, whose first
    # line is written just below.
    command = (
        "float-counts-prune {threshold} {num_words} {work_in}/float.all "
        "{work_in}/protected.all {float_star} 2>>{log_file}".format(
            threshold=threshold,
            num_words=num_words,
            work_in=work_in,
            float_star=float_star,
            log_file=log_file))
    # Start the log file afresh with the command line as a '#' header.
    with open(log_file, 'w', encoding="utf-8") as f:
        print("# " + command, file=f)
    try:
        print(command, file=sys.stderr)
        p = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             shell=True,
                             universal_newlines=True)
        # Line 1 of stdout: word count and total likelihood change.
        [word_count, like_change] = p.stdout.readline().split()
        like_change_per_word = float(like_change) / float(word_count)
        # Line 2: four numbers; only the first (total) and last (pruned)
        # are used below.
        [tot_xgrams, shadowed, protected, pruned] = p.stdout.readline().split()
        # Line 3: the n-gram counts (kept as strings) passed to
        # WriteNumNgrams below.
        num_ngrams = p.stdout.readline().split()

        # There must be no further output, and the command must succeed.
        assert p.stdout.readline() == ''
        ret = p.wait()
        assert ret == 0
        global current_num_xgrams

        current_num_xgrams = int(tot_xgrams) - int(pruned)
    except Exception as e:
        ExitProgram("error running command '{0}', error is '{1}'".format(
            command, repr(e)))

    WriteNumNgrams(work_out, num_ngrams)

    if args.remove_zeros == 'false':
        # create work_out/float.all.
        command = 'merge-float-counts {0} >{1}/float.all'.format(
            float_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        # the per-order files are no longer needed once merged
        for f in float_star.split():
            os.remove(f)
        # soft-link work_out/stats.all to work_in/stats.all
        SoftLink(work_in + "/stats.all", work_out + "/stats.all")
    else:
        # in this case we pipe the output of merge-float-counts into
        # float-counts-stats-remove-zeros.
        # set stats_star = 'work_out/stats.1 work_out/stats.2 ..'
        stats_star = " ".join([
            '{0}/stats.{1}'.format(work_out, n)
            for n in range(1, ngram_order + 1)
        ])

        command = (
            'merge-float-counts {float_star} | float-counts-stats-remove-zeros '
            '{num_words} /dev/stdin {work_in}/stats.all {work_out}/float.all '
            '{stats_star}'.format(num_words=num_words,
                                  float_star=float_star,
                                  work_in=work_in,
                                  work_out=work_out,
                                  stats_star=stats_star))
        log_file = work_out + '/log/remove_zeros.log'
        RunCommand(command, log_file, args.verbose == 'true')
        # create work_out/stats.all
        command = 'merge-float-counts {0} >{1}/stats.all'.format(
            stats_star, work_out)
        log_file = work_out + '/log/merge_float_counts.log'
        RunCommand(command, log_file, args.verbose == 'true')
        # remove both sets of per-order temporary files
        for f in float_star.split() + stats_star.split():
            os.remove(f)

    # create work_out/protected.all
    CreateProtectedCounts(work_out)
    return like_change_per_word
示例#26
0
def GetCountsMultiProcess(source_int_dir,
                          dest_count_dir,
                          ngram_order,
                          n,
                          num_proc,
                          max_mem,
                          num_splits=0):
    """Get the integer counts for training set 'n' using several processes.

    The input is '<source_int_dir>/<n>.txt.gz'.  When running on Cygwin, when
    num_proc <= 1, or when the input is smaller than 1000000 bytes, the work
    is delegated to GetCountsSingleProcess() and its result is returned.

    Otherwise a single bash command is run in which the unzipped input is
    distributed over num_proc named pipes, each pipe is processed by its own
    'get-text-counts | sort', and the sorted streams are merged and passed to
    'get-int-counts'.  With num_splits == 0 the counts go to
    '<dest_count_dir>/int.<n>.<order>' for orders 2..ngram_order (the first
    output is /dev/null); otherwise they are piped into 'split-int-counts',
    producing '<dest_count_dir>/int.<n>.split<j>' for j in 1..num_splits.

    'max_mem', if not the empty string, is divided by num_proc + 1 and passed
    to the sort commands as --buffer-size.

    A failing per-pipe pipeline leaves a marker file
    '<dest_count_dir>/log/.<n>.<p>.error'; ExitProgram() is called if any
    such marker exists afterwards, and also if the input size cannot be
    determined or the temporary directory cannot be created.
    """
    try:
        file_size = os.path.getsize('{0}/{1}.txt.gz'.format(source_int_dir, n))
    except:
        ExitProgram('get_counts.py: error getting file size of '
                    '{0}/{1}.txt.gz'.format(source_int_dir, n))

    if IsCygwin() or num_proc <= 1 or file_size < 1000000:
        if num_proc > 1 and file_size >= 1000000:
            # it's only because of Cygwin that we're not using multiple
            # processes this merits a warning.
            print(
                "get_counts.py: cygwin platform detected so named pipes won't work; "
                "using a single process (will be slower)")
        return GetCountsSingleProcess(source_int_dir, dest_count_dir,
                                      ngram_order, n, max_mem, num_splits)

    if num_splits == 0:
        int_counts_output = "/dev/null " + " ".join([
            "{0}/int.{1}.{2}".format(dest_count_dir, n, o)
            for o in range(2, ngram_order + 1)
        ])
    else:
        assert num_splits >= 1
        int_counts_output = '/dev/stdout | split-int-counts ' + \
            ' '.join(["{0}/int.{1}.split{2}".format(dest_count_dir, n, j)
                     for j in range(1, num_splits + 1)])

    try:
        # we want a temporary directory on a local file system
        # for the named pipes (fifos) that are created below.
        tempdir = tempfile.mkdtemp()
    except Exception as e:
        ExitProgram("Error creating temporary directory: " + str(e))

    # This has several pipes for the internal processing that write to and read
    # from other internal pipes; and we can't do this using '|' in the shell, we
    # need to use mkfifo.  This does not work properly on cygwin.

    log_dir = "{dest_count_dir}/log".format(dest_count_dir=dest_count_dir)
    # Remove error markers left over from an earlier run for this training
    # set.  (A plain loop: the removal is done only for its side effect.)
    for x in glob.glob("{log_dir}/.{n}.*.error".format(log_dir=log_dir,
                                                       n=n)):
        os.remove(x)

    log_file = "{log_dir}/get_counts.{n}.log".format(log_dir=log_dir, n=n)

    test_command = "bash -c 'set -o pipefail; (echo a; echo b) | "\
        "distribute-input-lines /dev/null /dev/null'"
    # We run the following command just to make sure distribute-input-lines is
    # on the path and compiled, since we get hard-to-debug errors if it fails.
    RunCommand(test_command, log_file)

    if max_mem == '':
        mem_opt = ''
    else:
        mem_opt = "--buffer-size={0}".format(
            DivideMemory(max_mem, num_proc + 1))
    # we use "bash -c '...'" to make sure it gets run in bash, since
    # for example 'set -o pipefail' would only work in bash.
    command = (
        "bash -c 'set -o pipefail; set -e; export LC_ALL=C; mkdir -p {0}; ".
        format(tempdir) + ''.join(
            ['mkfifo {0}/{1}; '.format(tempdir, p) for p in range(num_proc)]) +
        'trap "rm -r {0}" SIGINT SIGKILL SIGTERM EXIT; '.format(tempdir) +
        'gunzip -c {0}/{1}.txt.gz | distribute-input-lines '.format(
            source_int_dir, n) +
        ' '.join(['{0}/{1}'.format(tempdir, p) for p in range(num_proc)]) +
        '& ' + 'sort -m {0} '.format(mem_opt) + ' '.join([
            '<(get-text-counts {4} {0} <{1}/{2} | sort {3} || touch {5}/.{6}.{2}.error)'
            .format(
                ngram_order, tempdir, p, mem_opt, "--limit-unk-history"
                if args.limit_unk_history == 'true' else "", log_dir, n)
            for p in range(num_proc)
        ]) + '| uniq -c | get-int-counts {0}'.format(int_counts_output) + "'"
    )  # end the quote from the 'bash -c'.

    RunCommand(command, log_file, args.verbose == 'true')

    # Any marker file means one of the per-pipe pipelines failed.
    if len(glob.glob("{log_dir}/.{n}.*.error".format(log_dir=log_dir,
                                                     n=n))) > 0:
        ExitProgram(
            "Something went wrong for the get-text-counts or sort command for training set {n}."
            .format(n=n))
示例#27
0
        log_file = "{int_dir}/log/{int}.log".format(int_dir=args.int_dir,
                                                    int=int)
        output = GetCommandStdout(command, log_file)


# make sure 'scripts', 'scripts/internal', and 'src' directory are on the path
# (the three entries appended are: the directory of this script, its
# sibling '../src', and its 'internal' subdirectory)
os.environ['PATH'] = (os.environ['PATH'] + os.pathsep +
                      os.path.abspath(os.path.dirname(sys.argv[0])) +
                      os.pathsep +
                      os.path.abspath(os.path.dirname(sys.argv[0])) +
                      "/../src" + os.pathsep +
                      os.path.abspath(os.path.dirname(sys.argv[0])) +
                      "/internal")

# Validate both inputs with the external checker scripts; a non-zero exit
# status from either one aborts the program.
if os.system("validate_text_dir.py " + args.text_dir) != 0:
    ExitProgram("command validate_text_dir.py {0} failed".format(
        args.text_dir))

if os.system("validate_vocab.py " + args.vocab) != 0:
    ExitProgram("command validate_vocab.py {0} failed".format(args.vocab))

# text_to_int.py is looked for in the same directory as this script.
if not os.path.exists(
        os.path.abspath(os.path.dirname(sys.argv[0])) + "/text_to_int.py"):
    ExitProgram(
        "prepare_int_data.py: expected text_to_int.py to be on the path")

# create the output data directory
# (makedirs also creates args.int_dir itself if it is missing)
if not os.path.exists(args.int_dir + "/log"):
    os.makedirs(args.int_dir + "/log")

# remove any old *.int.gz files in the output data directory
# (this statement only collects their names)
filelist = [f for f in os.listdir(args.int_dir) if f.endswith(".int.gz")]
示例#28
0
# The two positional arguments: the input and the output LM directory.
parser.add_argument("lm_dir_in",
                    help="Source directory, for the input language model.")
parser.add_argument(
    "lm_dir_out", help="Output directory where the language model is created.")

args = parser.parse_args()

# Add the script dir and the src dir to the path.
# (the directory containing this script, and its sibling '../src')
os.environ['PATH'] = (os.environ['PATH'] + os.pathsep +
                      os.path.abspath(os.path.dirname(sys.argv[0])) +
                      os.pathsep +
                      os.path.abspath(os.path.dirname(sys.argv[0])) +
                      "/../src")

# Abort unless the external validator accepts the input LM directory
# (a non-zero exit status means failure).
if os.system("validate_lm_dir.py " + args.lm_dir_in) != 0:
    ExitProgram("failed to validate input LM-dir")

# verify the input string max_memory
if args.max_memory != '':
    # valid string max_memory must have at least two items
    if len(args.max_memory) >= 2:
        s = args.max_memory
        # valid string max_memory can be formatted as:
        # "a positive integer + a letter or a '%'" or "a positive integer"
        # the unit of memory size can also be 'T', 'P', 'E', 'Z', or 'Y'. They
        # are not included here considering their rare use in practice
        if s[-1] in ['b', 'B', '%', 'k', 'K', 'm', 'M', 'g', 'G'
                     ] or s[-1].isdigit():
            for x in s[:-1]:
                if not x.isdigit():
                    sys.exit(
示例#29
0
            "{lm_dir}/float.all.{n} | sort {mem_opt} || touch {lm_dir}/.{n}.error)"
            .format(opt=('--no-unigram' if n > 1 else ''),
                    ngram_order=ngram_order,
                    num_words=num_words,
                    lm_dir=args.lm_dir,
                    n=n,
                    mem_opt=mem_opt) for n in range(1, num_splits + 1)
        ]) +
        " | pre-arpa-to-arpa {lm_dir}/words.txt'".format(lm_dir=args.lm_dir))

# Echo the full command to stderr before running it.
print("format_arpa_lm.py: running " + command, file=sys.stderr)

# Run the command through the shell; 'ret' is the status returned by
# os.system.
ret = os.system(command)

if ret != 0:
    sys.exit("format_arpa_lm.py: command {0} exited with status {1}".format(
        command, ret))

# Any '.<something>.error' file in the LM directory is treated as a failure
# marker, even though the command itself returned zero.
if len(glob.glob("{lm_dir}/.*.error".format(lm_dir=args.lm_dir))) > 0:
    ExitProgram(
        "Something went wrong for the float-counts-to-pre-arpa or sort command."
    )

print("format_arpa_lm.py: succeeded formatting ARPA lm from {0}".format(
    args.lm_dir),
      file=sys.stderr)

# Report the elapsed wall-clock time; 't0' is assumed to have been recorded
# with time.time() earlier in the script -- TODO confirm.
t1 = time.time()
print('Total time formatting to ARPA = ' + str(t1 - t0), file=sys.stderr)
print('-' * 100, file=sys.stderr)