import sys


def read_sentences(path, size=None):
    """Read CoNLL-formatted sentences from `path`, returning at most `size` of them."""
    sentences = []
    words = []
    with open(path) as f:
        for line in f:
            if line != '\n':
                words.append(line)
            else:
                # A blank line marks the end of a sentence; index 0 is the
                # artificial ROOT node.
                sentence = {'tokens': ['ROOT'], 'arcs': [], 'pos': ['ROOT']}
                for word in words:
                    token_index, token, _, pos, _, _, head_index, label, _ = \
                        word.split('\t', 8)
                    sentence['tokens'].append(token)
                    sentence['pos'].append(pos)
                    sentence['arcs'].append((head_index, token_index, label))
                # do whatever preprocessing you want here
                sentences.append(sentence)
                if size is not None and len(sentences) >= size:
                    break
                words = []
    return sentences


if __name__ == '__main__':
    training_set = read_sentences('wsj.02-21.conll')
    dev_set = read_sentences('wsj.00.01.22.24.conll')
    test_set = read_sentences('wsj.23.conll')

    # Parser and feat_gen_lists are assumed to be defined elsewhere in this
    # module (or imported above).
    feature_generators = feat_gen_lists[int(sys.argv[2])]
    parser = Parser(feature_generators, decay=bool(int(sys.argv[3])))
    parser.train(training_set, dev_set)
    parser._add_to_caches(test_set, 'test', False)
    print("Final UAS: %f" % parser.evaluate(test_set, 'test'))
    parser.serialize('features_%s_%s.pickle' % (sys.argv[2], sys.argv[3]))
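
# A minimal, self-contained smoke test for read_sentences, illustrating its
# input/output contract. This is a hypothetical helper, not part of the
# original script; the two-token sentence is made up, and the column layout
# (ID, FORM, LEMMA, POS, POS, FEATS, HEAD, DEPREL, PHEAD, PDEPREL) is an
# assumption inferred from the tuple unpacking above, not verified against
# the WSJ .conll files used here.
def _demo_read_sentences():
    import tempfile

    sample = (
        "1\tMs.\t_\tNNP\t_\t_\t2\tNAME\t_\t_\n"
        "2\tHaag\t_\tNNP\t_\t_\t0\tROOT\t_\t_\n"
        "\n"  # trailing blank line flushes the sentence
    )
    with tempfile.NamedTemporaryFile('w', suffix='.conll', delete=False) as tmp:
        tmp.write(sample)
    parsed = read_sentences(tmp.name)
    # Expected shape:
    # [{'tokens': ['ROOT', 'Ms.', 'Haag'],
    #   'pos':    ['ROOT', 'NNP', 'NNP'],
    #   'arcs':   [('2', '1', 'NAME'), ('0', '2', 'ROOT')]}]
    print(parsed)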