Пример #1
0
            if opts.onlyfold is not None and i != opts.onlyfold:
                continue
            left = i*foldlen
            right = min(totalnum, (i+1)*foldlen)

            print >> logs, i, "\t [%d, %d) \t%d lines" % (left, right, right-left)

            os.system("mkdir " + thisdir[i])

            infold_input = open(thisdir[i] + "/toparse.ecinput", "wt")
            outfold_gold = open(thisdir[i] + "/totrain.cleangold", "wt")
            infold_gold = open(thisdir[i] + "/toparse.cleangold", "wt")

            for j, line in enumerate(goldtrees[left : right]):
                start = "<s small.%d.%d>" % (i, j+1)
                print >> infold_input,  start, " ".join(words_from_line(line)), "</s>"

            print >> infold_gold, "".join(goldtrees[left : right]),

            print >> outfold_gold, "".join(goldtrees[:left] + goldtrees[right:]),
        
####### TRAINING *****************

    print >> logs, "**************************** training folds ************************"
    if opts.fromstep <= 1:

        traindir = os.environ["HOME"] + "/rerank/first-stage/TRAIN"
        trainscript = traindir + "/allScript"

        for i in xrange(opts.numfolds):
Пример #2
0
#!/usr/bin/env python

import sys
from utility import num_words, words_from_line

## cat trees.txt | filter_by_length.py [-w] [<max_len>]

# Length limit applied when <max_len> is missing or is not an integer.
DEFAULT_MAX_LEN = 400


def parse_args(argv):
    """Parse the command line of this script.

    Usage: ``filter_by_length.py [-w] [<max_len>]``

    Args:
        argv: Full argument vector with the program name first (the same
            layout as ``sys.argv``).  It is not modified.

    Returns:
        A ``(print_words, max_len)`` tuple.  ``print_words`` is True when the
        first argument is ``-w``.  ``max_len`` is the integer value of the
        argument that follows (or of the first argument when ``-w`` is
        absent), falling back to ``DEFAULT_MAX_LEN`` when that argument is
        missing or is not a valid integer.
    """
    # Work on a copy so the caller's list (usually sys.argv) stays intact.
    args = list(argv[1:])

    # Both arguments are optional, so guard against an empty list before
    # looking at the first one.
    print_words = bool(args) and args[0] == "-w"
    if print_words:
        del args[0]

    try:
        max_len = int(args[0])
    except (IndexError, ValueError):
        # No length given, or it is not a number: use the default.
        max_len = DEFAULT_MAX_LEN

    return print_words, max_len


if __name__ == "__main__":

    print_words, max_len = parse_args(sys.argv)

    for line in sys.stdin:
        words = words_from_line(line)
        if len(words) <= max_len:
            # Emit either the extracted words or the original line with
            # surrounding whitespace removed.
            output = " ".join(words) if print_words else line.strip()
            # sys.stdout.write produces the same output as the print
            # statement here and parses under both Python 2 and Python 3.
            sys.stdout.write(output + "\n")