示例#1
0
        print 'Test set: %d instances.' % len(test_x[0])

        max_step = test_max_slen_c

        print 'Longest sentence by character is %d. ' % test_max_slen_c
        print 'Longest sentence by word is %d. ' % test_max_slen_w

        print 'Longest word is %d. ' % test_max_wlen

        if graphic:
            new_pixels = toolbox.get_new_pixels(new_chars, font, pic_size)
            pixels += new_pixels

        if ngram > 1:
            gram2idx = toolbox.get_ngram_dic(grams)
            new_grams = toolbox.get_new_grams(path + '/' + test_file, gram2idx)
            if args.ngram_embeddings is not None:
                new_grams = toolbox.get_valid_grams(new_grams,
                                                    args.ngram_embeddings)
                gram2idx = toolbox.update_gram_dicts(gram2idx, new_grams)

            test_gram = toolbox.get_gram_vec(path, test_file, gram2idx)
            test_x += test_gram

        for k in range(len(test_x)):
            test_x[k] = toolbox.pad_zeros(test_x[k], max_step)
        for k in range(len(test_y)):
            test_y[k] = toolbox.pad_zeros(test_y[k], max_step)

    elif args.action == 'tag':
        assert args.raw is not None
示例#2
0
    def tokenize(self, text):
        """Tokenize the input text.

        The text is first written to a temporary file, because the
        ``toolbox`` readers used below are given a file path. That file is
        closed (and therefore deleted) as soon as every reader has consumed
        it, even if one of them raises.

        :param text: the text to tokenize.
        :return: a list of the sentences contained in the text. Each element is
        a list of tokens and each token is a tuple made of its raw string and
        its offset in the text.
        """
        (char2idx, unk_chars_idx,
         idx2char, _, idx2tag, _) = toolbox.get_dicts(self.path,
                                                      True,
                                                      self.tag_scheme,
                                                      self.crf)

        use_ngrams = self.ngram > 1
        grams = None
        if use_ngrams:
            grams = toolbox.read_ngrams(self.path, self.ngram)

        # Same length is used as the reading limit and as the padding target.
        padded_len = self.sent_limit + 100

        # The context manager guarantees the temporary file is released once
        # the inputs have been read, instead of lingering until garbage
        # collection.
        with tempfile.NamedTemporaryFile(mode='w') as raw_file_f:
            raw_file_f.write(text)
            # Flush so that the readers, which reopen the file by name, see
            # the full text.
            raw_file_f.flush()
            raw_file = raw_file_f.name

            new_chars = toolbox.get_new_chars(raw_file, char2idx,
                                              self.is_space)

            if self.emb_path is not None:
                valid_chars = toolbox.get_valid_chars(new_chars,
                                                      self.emb_path)
            else:
                valid_chars = None

            (char2idx, idx2char,
             unk_chars_idx, sub_dict) = toolbox.update_char_dict(
                 char2idx, new_chars, unk_chars_idx, valid_chars)

            raw_x, _ = toolbox.get_input_vec_tag(None, raw_file, char2idx,
                                                 limit=padded_len,
                                                 is_space=self.is_space)

            if use_ngrams:
                gram2idx = toolbox.get_ngram_dic(grams)
                # NOTE(review): the returned value was never used by this
                # method; the call is kept in case it has side effects --
                # confirm against toolbox and drop it if it is pure.
                toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                      is_space=self.is_space)

                raw_x += toolbox.get_gram_vec_tag(None, raw_file, gram2idx,
                                                  limit=padded_len,
                                                  is_space=self.is_space)

        # Pad every input sequence in place to the common length.
        for k, column in enumerate(raw_x):
            raw_x[k] = toolbox.pad_zeros(column, padded_len)

        # Register the characters discovered in this text with the model
        # before tagging.
        with self.main_graph.as_default():
            self.model.define_updates(new_chars=new_chars,
                                      emb_path=self.emb_path,
                                      char2idx=char2idx)

        with tf.device(self.gpu_config):
            self.model.run_updates(self.main_sess)

        # NOTE(review): updates run in ``self.main_sess`` while tagging uses
        # ``self.sess`` -- confirm that two distinct sessions are intended.
        return self.model.tag(raw_x,
                              idx2tag,
                              idx2char,
                              unk_chars_idx,
                              sub_dict,
                              self.sess,
                              batch_size=self.tag_batch)
示例#3
0
            limit=args.sent_limit + 100,
            sent_seg=sent_seg,
            is_space=is_space,
            ignore_space=args.ignore_space)

        max_step = max_len_test

        if sent_seg:
            print 'Joint sentence segmentation...'
        else:
            print 'Test set: %d instances.' % len(test_x[0])

        if ngram > 1:
            gram2idx = toolbox.get_ngram_dic(grams)
            new_grams = toolbox.get_new_grams(path + '/' + test_file,
                                              gram2idx,
                                              is_space=is_space)

            test_grams = toolbox.get_gram_vec(path,
                                              'raw_test.txt',
                                              gram2idx,
                                              is_raw=True,
                                              limit=args.sent_limit + 100,
                                              sent_seg=sent_seg,
                                              is_space=is_space,
                                              ignore_space=args.ignore_space)
            test_x += test_grams

        for k in range(len(test_x)):
            test_x[k] = toolbox.pad_zeros(test_x[k], max_step)
示例#4
0
            if sent_seg:
                print 'Joint sentence segmentation...'
            else:
                print 'Raw setences: %d instances.' % len(raw_x[0])

            max_step = raw_len

        else:

            max_step = args.sent_limit

        if ngram > 1:
            gram2idx = toolbox.get_ngram_dic(grams)
            new_grams = toolbox.get_new_grams(raw_file,
                                              gram2idx,
                                              is_raw=True,
                                              is_space=is_space)

            if not args.segment_large:
                if sent_seg:
                    raw_grams = toolbox.get_gram_vec_tag(
                        None,
                        raw_file,
                        gram2idx,
                        limit=args.sent_limit + 100,
                        is_space=is_space)
                else:
                    raw_grams = toolbox.get_gram_vec(None,
                                                     raw_file,
                                                     gram2idx,
                                                     is_raw=True,