def tokenize(self, text):
    """Tokenize the input text.

    :param text: the text to tokenize.
    :return: a list of the sentences contained in the text. Each element is a
        list of tokens, and each token is a tuple made of its raw string and
        its offset in the text.
    """
    grams, gram2idx = None, None

    # Load the character/tag dictionaries built at training time.
    (char2idx, unk_chars_idx, idx2char, tag2idx, idx2tag,
     _) = toolbox.get_dicts(self.path, True, self.tag_scheme, self.crf)

    if self.ngram > 1:
        grams = toolbox.read_ngrams(self.path, self.ngram)

    # Write the input text to a temporary file so the toolbox readers,
    # which operate on files, can consume it.
    raw_file_f = tempfile.NamedTemporaryFile(mode='w')
    raw_file_f.write(text)
    raw_file_f.flush()
    raw_file = raw_file_f.name

    # Collect characters unseen at training time and extend the dictionary.
    new_chars = toolbox.get_new_chars(raw_file, char2idx, self.is_space)

    if self.emb_path is not None:
        valid_chars = toolbox.get_valid_chars(new_chars, self.emb_path)
    else:
        valid_chars = None

    (char2idx, idx2char, unk_chars_idx,
     sub_dict) = toolbox.update_char_dict(char2idx, new_chars, unk_chars_idx,
                                          valid_chars)

    # Convert the raw text into index vectors.
    raw_x, raw_len = toolbox.get_input_vec_tag(None, raw_file, char2idx,
                                               limit=self.sent_limit + 100,
                                               is_space=self.is_space)

    # Optionally add character n-gram features.
    if self.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(grams)
        new_grams = toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                          is_space=self.is_space)
        raw_grams = toolbox.get_gram_vec_tag(None, raw_file, gram2idx,
                                             limit=self.sent_limit + 100,
                                             is_space=self.is_space)
        raw_x += raw_grams

    # Pad every input vector to a fixed length.
    for k in range(len(raw_x)):
        raw_x[k] = toolbox.pad_zeros(raw_x[k], self.sent_limit + 100)

    # Register the embedding updates for the newly seen characters and run
    # them before decoding.
    with self.main_graph.as_default():
        self.model.define_updates(new_chars=new_chars, emb_path=self.emb_path,
                                  char2idx=char2idx)

    with tf.device(self.gpu_config):
        self.model.run_updates(self.main_sess)

    # Decode: returns, per sentence, a list of (raw_token, offset) tuples.
    sentences = self.model.tag(raw_x, idx2tag, idx2char, unk_chars_idx,
                               sub_dict, self.sess, batch_size=self.tag_batch)

    return sentences
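# ---------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the original module). It assumes the
# class defining `tokenize` above is exposed as `Tokenizer` and is constructed
# from a model directory -- both the class name and the constructor signature
# are assumptions, not confirmed by this file. Each returned sentence is a
# list of (raw_token, offset) tuples, as documented in the docstring.
# ---------------------------------------------------------------------------

def print_tokens(tokenizer, text):
    """Print every token of `text` together with its character offset."""
    for sentence in tokenizer.tokenize(text):
        for raw, offset in sentence:
            print('%d\t%s' % (offset, raw))

# Hypothetical call:
# tokenizer = Tokenizer(path='models/en')
# print_tokens(tokenizer, 'Hello world. How are you?')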
        ignore_space=args.ignore_space)

if args.sent_seg:
    print('Joint sentence segmentation...')
else:
    print('Training set: %d instances; Dev set: %d instances.'
          % (len(train_x[0]), len(dev_x[0])))

nums_grams = None
ng_embs = None

# Build the n-gram vocabulary file if it is missing or a reset was requested,
# then load it and vectorise the training and development data.
if args.ngram > 1 and (args.reset or
                       not os.path.isfile(path + '/' + str(args.ngram) + 'gram.txt')):
    toolbox.get_ngrams(path, args.ngram, is_space)

ngram = toolbox.read_ngrams(path, args.ngram)

if args.ngram > 1:
    gram2idx = toolbox.get_ngram_dic(ngram)
    train_gram = toolbox.get_gram_vec(path, 'tag_train.txt', gram2idx,
                                      limit=args.sent_limit,
                                      sent_seg=args.sent_seg,
                                      is_space=is_space,
                                      ignore_space=args.ignore_space)
    dev_gram = toolbox.get_gram_vec(path, 'raw_dev.txt', gram2idx,
                                    is_raw=True, limit=args.sent_limit,
        ignore_space=args.ignore_space)

if args.sent_seg:
    print('Joint sentence segmentation...')
else:
    print('Training set: %d instances; Dev set: %d instances.'
          % (len(train_x1[0]), len(dev_x1[0])))

nums_grams = None
ng_embs = None

if args.ngram > 1 and (args.reset or
                       not os.path.isfile(path_ + '/' + str(args.ngram) + 'gram.txt')):
    toolbox.get_ngrams(path_, args.ngram, is_space)

ngram = toolbox.read_ngrams(path_, args.ngram)

if args.ngram > 1:
    gram2idx = toolbox.get_ngram_dic(ngram)
    train_gram = toolbox.get_gram_vec(path_, 'tag_train.txt', gram2idx,
                                      limit=args.sent_limit,
                                      sent_seg=args.sent_seg,
                                      is_space=is_space,
                                      ignore_space=args.ignore_space)
    dev_gram = toolbox.get_gram_vec(path_, 'raw_dev.txt', gram2idx,
                                    is_raw=True, limit=args.sent_limit,
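# ---------------------------------------------------------------------------
# Illustration (assumed behaviour; not the toolbox implementation). The
# training code above delegates n-gram handling to toolbox.get_ngrams,
# toolbox.get_ngram_dic and toolbox.get_gram_vec. The sketch below shows, in
# simplified form, what such helpers typically do: build a character n-gram
# vocabulary from text, then map each character position of a sentence to an
# n-gram index padded to a fixed length. The function names, the <PAD>/<UNK>
# index conventions and the padding scheme are assumptions made for
# illustration only.
# ---------------------------------------------------------------------------

from collections import Counter


def build_char_ngram_vocab(lines, n, min_count=1):
    """Count character n-grams of order `n` and index the frequent ones."""
    counts = Counter()
    for line in lines:
        chars = line.rstrip('\n')
        for i in range(len(chars) - n + 1):
            counts[chars[i:i + n]] += 1
    gram2idx = {'<PAD>': 0, '<UNK>': 1}           # reserved indices (assumed)
    for gram, count in counts.most_common():
        if count >= min_count:
            gram2idx[gram] = len(gram2idx)
    return gram2idx


def sentence_to_gram_indices(sentence, gram2idx, n, limit):
    """Map each character position to an n-gram index and pad to `limit`."""
    indices = []
    for i in range(min(len(sentence), limit)):
        gram = sentence[i:i + n]
        indices.append(gram2idx.get(gram, 1))     # 1 = <UNK> for unseen grams
    indices.extend([0] * (limit - len(indices)))  # 0 = <PAD> up to the limit
    return indices


# Example:
#   gram2idx = build_char_ngram_vocab(['hello world'], n=2)
#   sentence_to_gram_indices('hello', gram2idx, n=2, limit=10)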