pixels = toolbox.read_chars_pixels(path, font_name, pic_size)

s_time = time()

if args.action == 'test':
    assert args.test is not None
    test_file = args.test
    # Characters in the test set that were unseen during training.
    new_chars = toolbox.get_new_chars(path + '/' + test_file, char2idx)
    valid_chars = None
    if args.embeddings is not None:
        valid_chars = toolbox.get_valid_chars(new_chars, args.embeddings)
    char2idx, idx2char, unk_char2idx = toolbox.update_char_dict(
        char2idx, new_chars, valid_chars)
    test_x, test_y, test_max_slen_c, test_max_slen_w, test_max_wlen = \
        toolbox.get_input_vec(path, test_file, char2idx, tag2idx,
                              tag_scheme=tag_scheme, rad_dic=rad_dic)
    print('Test set: %d instances.' % len(test_x[0]))
    max_step = test_max_slen_c
    print('Longest sentence by character is %d.' % test_max_slen_c)
    print('Longest sentence by word is %d.' % test_max_slen_w)
    print('Longest word is %d.' % test_max_wlen)
    if graphic:
        new_pixels = toolbox.get_new_pixels(new_chars, font, pic_size)
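# For illustration: a minimal, hypothetical sketch of what the two
# dictionary-update helpers above could look like. The bodies below are
# assumptions made for clarity, not the actual toolbox implementation,
# which additionally handles unknown-character substitution and checks
# against pretrained embeddings.
import codecs

def get_new_chars_sketch(filename, char2idx):
    """Collect characters in `filename` that the model has never seen."""
    new_chars = set()
    with codecs.open(filename, encoding='utf-8') as f:
        for line in f:
            for ch in line.strip():
                if ch not in char2idx:
                    new_chars.add(ch)
    return list(new_chars)

def update_char_dict_sketch(char2idx, new_chars):
    """Append unseen characters to the index, keeping existing ids stable."""
    for ch in new_chars:
        if ch not in char2idx:
            char2idx[ch] = len(char2idx)
    idx2char = {i: ch for ch, i in char2idx.items()}
    return char2idx, idx2char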
def tokenize(self, text):
    """Tokenize the input text.

    :param text: the text to tokenize.
    :return: a list of the sentences contained in the text. Each element
        is a list of tokens, and each token is a tuple made of its raw
        string and its offset in the text.
    """
    grams, gram2idx = None, None
    (char2idx, unk_chars_idx, idx2char, tag2idx, idx2tag,
     _) = toolbox.get_dicts(self.path, True, self.tag_scheme, self.crf)
    if self.ngram > 1:
        grams = toolbox.read_ngrams(self.path, self.ngram)

    # The toolbox readers work on files, so dump the raw text into a
    # temporary file first.
    raw_file_f = tempfile.NamedTemporaryFile(mode='w')
    raw_file_f.write(text)
    raw_file_f.flush()
    raw_file = raw_file_f.name

    # Characters in the input that were unseen during training.
    new_chars = toolbox.get_new_chars(raw_file, char2idx, self.is_space)
    if self.emb_path is not None:
        valid_chars = toolbox.get_valid_chars(new_chars, self.emb_path)
    else:
        valid_chars = None
    (char2idx, idx2char, unk_chars_idx,
     sub_dict) = toolbox.update_char_dict(char2idx, new_chars,
                                          unk_chars_idx, valid_chars)

    raw_x, raw_len = toolbox.get_input_vec_tag(
        None, raw_file, char2idx, limit=self.sent_limit + 100,
        is_space=self.is_space)

    new_grams = None
    if self.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(grams)
        new_grams = toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                          is_space=self.is_space)
        raw_grams = toolbox.get_gram_vec_tag(
            None, raw_file, gram2idx, limit=self.sent_limit + 100,
            is_space=self.is_space)
        raw_x += raw_grams

    # Pad every input sequence to a fixed length.
    for k in range(len(raw_x)):
        raw_x[k] = toolbox.pad_zeros(raw_x[k], self.sent_limit + 100)

    # Extend the embedding matrices with the unseen characters before
    # tagging.
    with self.main_graph.as_default():
        self.model.define_updates(new_chars=new_chars,
                                  emb_path=self.emb_path,
                                  char2idx=char2idx)
    with tf.device(self.gpu_config):
        self.model.run_updates(self.main_sess)

    sentences = self.model.tag(raw_x, idx2tag, idx2char, unk_chars_idx,
                               sub_dict, self.sess,
                               batch_size=self.tag_batch)
    return sentences
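# To make the return shape concrete, a small hypothetical consumer of
# tokenize() output, assuming exactly the structure the docstring
# describes: sentences -> tokens -> (raw string, offset in text).
def token_spans(sentences):
    """Turn tokenize() output into (start, end, token) character spans."""
    spans = []
    for sentence in sentences:
        for raw, offset in sentence:
            spans.append((offset, offset + len(raw), raw))
    return spans

# With a toy result shaped like the docstring describes:
example = [[(u'Hello', 0), (u'world', 6)], [(u'Bye', 13)]]
print(token_spans(example))
# [(0, 5, u'Hello'), (6, 11, u'world'), (13, 16, u'Bye')]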
raws_test = reader.raw(path + '/raw_test.txt')
test_y_gold = reader.test_gold(path + '/' + test_file, form=args.format,
                               is_space=is_space,
                               ignore_mwt=args.ignore_mwt)

new_chars = toolbox.get_new_chars(path + '/raw_test.txt', char2idx, is_space)

if emb_path is not None:
    # list() keeps the concatenation valid under Python 3, where .keys()
    # returns a view rather than a list.
    valid_chars = toolbox.get_valid_chars(new_chars + list(char2idx.keys()),
                                          emb_path)
else:
    valid_chars = None

char2idx, idx2char, unk_chars_idx, sub_dict = toolbox.update_char_dict(
    char2idx, new_chars, unk_chars_idx, valid_chars)

test_x, max_len_test = toolbox.get_input_vec_raw(
    path, 'raw_test.txt', char2idx, limit=args.sent_limit + 100,
    sent_seg=sent_seg, is_space=is_space, ignore_space=args.ignore_space)
max_step = max_len_test

if sent_seg:
    print('Joint sentence segmentation...')
else:
max_step = None

if radical:
    rad_dic = toolbox.get_radical_dic()

if graphic:
    # Strip the extension to get the font name.
    font_name = font[:font.index('.')]
    pixels = toolbox.read_chars_pixels(path, font_name, pic_size)

if args.action == 'test':
    assert args.test is not None
    test_file = args.test
    # Characters in the test set that were unseen during training.
    new_chars = toolbox.get_new_chars(path + '/' + test_file, char2idx)
    char2idx, idx2char = toolbox.update_char_dict(char2idx, new_chars)
    test_x, test_y, test_max_slen_c, test_max_slen_w, test_max_wlen, _ = \
        toolbox.get_input_vec(path, test_file, char2idx, tag2idx,
                              tag_scheme=tag_scheme, rad_dic=rad_dic)
    print('Test set: %d instances.' % len(test_x[0]))
    max_step = test_max_slen_c
    print('Longest sentence by character is %d.' % test_max_slen_c)
    print('Longest sentence by word is %d.' % test_max_slen_w)
    print('Longest word is %d.' % test_max_wlen)
    if graphic:
        new_pixels = toolbox.get_new_pixels(new_chars, font, pic_size)
        pixels += new_pixels
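# The graphic branch above feeds each character's glyph bitmap into the
# network. Below is a hypothetical sketch of how such pic_size x pic_size
# grids could be rasterised with Pillow; the function name, the font file
# and the [0, 1] normalisation are assumptions, not the toolbox code.
from PIL import Image, ImageDraw, ImageFont

def render_char_pixels(ch, font_path, pic_size):
    """Draw one character into a flattened pic_size x pic_size gray grid."""
    fnt = ImageFont.truetype(font_path, pic_size)
    img = Image.new('L', (pic_size, pic_size), color=255)
    ImageDraw.Draw(img).text((0, 0), ch, fill=0, font=fnt)
    # 1.0 marks inked pixels, 0.0 blank background.
    return [1.0 - p / 255.0 for p in img.getdata()]

# e.g. pixel grids for a few characters, with a hypothetical font file:
# pixels = [render_char_pixels(c, 'simsun.ttf', pic_size) for c in u'新华社']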