# Vectorise the training and dev sets with the character and tag dictionaries.
train_x, train_y, max_len_train = toolbox.get_input_vec(path, 'tag_train.txt', char2idx, tag2idx,
                                                        limit=args.sent_limit, sent_seg=args.sent_seg,
                                                        is_space=is_space, train_size=args.train_size,
                                                        ignore_space=args.ignore_space)

dev_x, max_len_dev = toolbox.get_input_vec_raw(path, 'raw_dev.txt', char2idx, limit=args.sent_limit,
                                               sent_seg=args.sent_seg, is_space=is_space,
                                               ignore_space=args.ignore_space)

if args.sent_seg:
    print 'Joint sentence segmentation...'
else:
    print 'Training set: %d instances; Dev set: %d instances.' % (len(train_x[0]), len(dev_x[0]))

nums_grams = None
ng_embs = None

# (Re)build the n-gram vocabulary file if it is missing or a reset is requested.
if args.ngram > 1 and (args.reset or not os.path.isfile(path + '/' + str(args.ngram) + 'gram.txt')):
# Collect characters unseen during training and extend the dictionary with them.
new_chars = toolbox.get_new_chars(raw_file, char2idx, type='raw')

valid_chars = None
if args.embeddings is not None:
    valid_chars = toolbox.get_valid_chars(new_chars, args.embeddings)

char2idx, idx2char, unk_char2idx = toolbox.update_char_dict(char2idx, new_chars, valid_chars)

if not args.tag_large:
    raw_x, raw_len = toolbox.get_input_vec_raw(None, raw_file, char2idx, rad_dic=rad_dic)
    print 'Number of sentences: %d.' % len(raw_x[0])
    max_step = raw_len
else:
    # For large inputs, bucket the file instead of vectorising it in one pass.
    max_step = toolbox.get_maxstep(raw_file, args.bucket_size)
print 'Longest sentence is %d.' % max_step

if graphic:
    # Render pixel features for the newly added characters.
    new_pixels = toolbox.get_new_pixels(new_chars, font, pic_size)
    pixels += new_pixels

if ngram > 1:
char2idx, idx2char, unk_chars_idx = toolbox.update_char_dict(char2idx, new_chars, unk_chars_idx,
                                                             valid_chars)

if not args.segment_large:
    if sent_seg:
        raw_x, raw_len = toolbox.get_input_vec_tag(None, raw_file, char2idx,
                                                   limit=args.sent_limit + 100, is_space=is_space)
    else:
        raw_x, raw_len = toolbox.get_input_vec_raw(None, raw_file, char2idx,
                                                   limit=args.sent_limit + 100, sent_seg=sent_seg,
                                                   is_space=is_space)
    if sent_seg:
        print 'Joint sentence segmentation...'
    else:
        print 'Raw sentences: %d instances.' % len(raw_x[0])
    max_step = raw_len
else:
    max_step = args.sent_limit