示例#1
0
    s_time = None

    if radical:
        rad_dic = toolbox.get_radical_dic()

    if graphic:
        font_name = font[:font.index('.')]
        pixels = toolbox.read_chars_pixels(path, font_name, pic_size)

    s_time = time()
    if args.action == 'test':
        assert args.test is not None

        test_file = args.test
        new_chars = toolbox.get_new_chars(path + '/' + test_file, char2idx)

        valid_chars = None

        if args.embeddings is not None:

            valid_chars = toolbox.get_valid_chars(new_chars, args.embeddings)

        char2idx, idx2char, unk_char2idx = toolbox.update_char_dict(
            char2idx, new_chars, valid_chars)

        test_x, test_y, test_max_slen_c, test_max_slen_w, test_max_wlen = \
            toolbox.get_input_vec(path, test_file, char2idx, tag2idx, tag_scheme=tag_scheme, rad_dic=rad_dic)

        print 'Test set: %d instances.' % len(test_x[0])
示例#2
0
    def tokenize(self, text):
        """Tokenize the input text.

        The text is written to a temporary file so that the file-based
        ``toolbox`` readers can build the model inputs. The temporary file is
        closed (and thereby deleted) as soon as those inputs have been built,
        instead of lingering until garbage collection.

        :param text: the text to tokenize.
        :return: the result of ``self.model.tag``: a list of the sentences
        contained in the text. Each element is a list of tokens and each token
        is a tuple made of its raw string and its offset in the text.
        """
        # Every input sequence is read with, and padded to, this length.
        padded_len = self.sent_limit + 100
        use_ngrams = self.ngram > 1

        (char2idx, unk_chars_idx,
         idx2char, _, idx2tag, _) = toolbox.get_dicts(self.path,
                                                      True,
                                                      self.tag_scheme,
                                                      self.crf)

        grams = None
        if use_ngrams:
            grams = toolbox.read_ngrams(self.path, self.ngram)

        # The context manager guarantees the temporary file is closed even if
        # one of the readers below raises.
        # NOTE(review): the file is written with the platform default
        # encoding -- confirm this matches what the toolbox readers expect.
        with tempfile.NamedTemporaryFile(mode='w') as raw_file_f:
            raw_file_f.write(text)
            # Flush so the readers, which reopen the file by name, see the
            # full text.
            raw_file_f.flush()
            raw_file = raw_file_f.name

            new_chars = toolbox.get_new_chars(raw_file, char2idx,
                                              self.is_space)

            valid_chars = None
            if self.emb_path is not None:
                valid_chars = toolbox.get_valid_chars(new_chars,
                                                      self.emb_path)

            (char2idx, idx2char,
             unk_chars_idx, sub_dict) = toolbox.update_char_dict(
                char2idx, new_chars, unk_chars_idx, valid_chars)

            raw_x, _ = toolbox.get_input_vec_tag(None, raw_file, char2idx,
                                                 limit=padded_len,
                                                 is_space=self.is_space)

            if use_ngrams:
                gram2idx = toolbox.get_ngram_dic(grams)
                # NOTE(review): the result of this call was never used by the
                # original code; the call is kept in case it has side effects.
                toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                      is_space=self.is_space)

                raw_x += toolbox.get_gram_vec_tag(None, raw_file, gram2idx,
                                                  limit=padded_len,
                                                  is_space=self.is_space)

        for k, sequence in enumerate(raw_x):
            raw_x[k] = toolbox.pad_zeros(sequence, padded_len)

        with self.main_graph.as_default():
            self.model.define_updates(new_chars=new_chars,
                                      emb_path=self.emb_path,
                                      char2idx=char2idx)

        # NOTE(review): the updates run on ``self.main_sess`` while tagging
        # below uses ``self.sess`` -- confirm both sessions are intended.
        with tf.device(self.gpu_config):
            self.model.run_updates(self.main_sess)

        return self.model.tag(raw_x,
                              idx2tag,
                              idx2char,
                              unk_chars_idx,
                              sub_dict,
                              self.sess,
                              batch_size=self.tag_batch)
示例#3
0
        cat = 'other'
        if 'Chinese' in path or 'Japanese' in path:
            cat = 'zh'
        for line in codecs.open(path + '/' + test_file, 'r', encoding='utf-8'):
            if len(line) < 2:
                break
            if '# sentence' in line or '# text' in line:
                cat = 'gold'
        reader.get_raw(path, test_file, 'raw_test.txt', cat, form=args.format)

        raws_test = reader.raw(path + '/raw_test.txt')
        test_y_gold = reader.test_gold(path + '/' + test_file,
                                       form=args.format,
                                       is_space=is_space)

        new_chars = toolbox.get_new_chars(path + '/raw_test.txt', char2idx,
                                          is_space)

        if emb_path is not None:
            valid_chars = toolbox.get_valid_chars(new_chars + char2idx.keys(),
                                                  emb_path)
        else:
            valid_chars = None

        char2idx, idx2char, unk_chars = toolbox.update_char_dict(
            char2idx, new_chars, unk_chars, valid_chars)

        test_x, max_len_test = toolbox.get_input_vec_raw(
            path,
            'raw_test.txt',
            char2idx,
            limit=args.sent_limit + 100,
示例#4
0
                break
            if '# sentence' in line or '# text' in line:
                cat = 'gold'
        reader.get_raw(test_language_dir,
                       test_file,
                       'raw_test.txt',
                       cat,
                       form=args.format)

        raws_test = reader.raw(test_language_dir + '/raw_test.txt')
        test_y_gold = reader.test_gold(test_language_dir + '/' + test_file,
                                       form=args.format,
                                       is_space=is_space,
                                       ignore_mwt=args.ignore_mwt)

        new_chars = toolbox.get_new_chars(test_language_dir + '/raw_test.txt',
                                          char2idx, is_space)

        if emb_path is not None:
            valid_chars = toolbox.get_valid_chars(new_chars + char2idx.keys(),
                                                  emb_path)
        else:
            valid_chars = None

        char2idx, idx2char, unk_chars_idx, sub_dict = toolbox.update_char_dict(
            char2idx, new_chars, unk_chars_idx, valid_chars)

        test_x1, test_x2, max_len_test = toolbox.get_input_vec_raw_test_new(
            test_language_dir,
            test_language,
            'raw_test.txt',
            char2idx,