Exemplo n.º 1
0
    def tokenize(self, text):
        """Tokenize the input text.

        The text is written to a temporary file because the ``toolbox``
        helpers used here are given a file name rather than a string. The
        temporary file is closed (and therefore removed) as soon as the last
        helper that reads it has returned, even if one of them raises.

        :param text: the text to tokenize.
        :return: a list of the sentences contained in the text. Each element is
        a list of tokens and each token is a tuple made of its raw string and
        its offset in the text.
        """
        # The same length bound is used for reading and for padding.
        limit = self.sent_limit + 100
        grams = None

        # Only the character and tag dictionaries are needed here; idx2char is
        # rebuilt below by update_char_dict.
        (char2idx, unk_chars_idx,
         _, _, idx2tag, _) = toolbox.get_dicts(self.path,
                                               True,
                                               self.tag_scheme,
                                               self.crf)

        if self.ngram > 1:
            grams = toolbox.read_ngrams(self.path, self.ngram)

        with tempfile.NamedTemporaryFile(mode='w') as raw_file_f:
            raw_file_f.write(text)
            # Flush so that the helpers, which only receive the file name,
            # see the complete text.
            raw_file_f.flush()
            raw_file = raw_file_f.name

            new_chars = toolbox.get_new_chars(raw_file, char2idx,
                                              self.is_space)

            if self.emb_path is not None:
                valid_chars = toolbox.get_valid_chars(new_chars,
                                                      self.emb_path)
            else:
                valid_chars = None

            (char2idx, idx2char,
             unk_chars_idx, sub_dict) = toolbox.update_char_dict(
                 char2idx, new_chars, unk_chars_idx, valid_chars)

            raw_x, _ = toolbox.get_input_vec_tag(None, raw_file, char2idx,
                                                 limit=limit,
                                                 is_space=self.is_space)

            if self.ngram > 1:
                gram2idx = toolbox.get_ngram_dic(grams)
                # NOTE(review): the result of get_new_grams was never used by
                # this method; the call is kept in case it has side effects.
                toolbox.get_new_grams(raw_file, gram2idx, is_raw=True,
                                      is_space=self.is_space)

                raw_x += toolbox.get_gram_vec_tag(None, raw_file, gram2idx,
                                                  limit=limit,
                                                  is_space=self.is_space)

        # From here on the raw file is no longer needed.
        raw_x = [toolbox.pad_zeros(vec, limit) for vec in raw_x]

        with self.main_graph.as_default():
            self.model.define_updates(new_chars=new_chars,
                                      emb_path=self.emb_path,
                                      char2idx=char2idx)

        # NOTE(review): the updates run on self.main_sess while tagging below
        # uses self.sess -- confirm that the two sessions are meant to differ.
        with tf.device(self.gpu_config):
            self.model.run_updates(self.main_sess)

        return self.model.tag(raw_x,
                              idx2tag,
                              idx2char,
                              unk_chars_idx,
                              sub_dict,
                              self.sess,
                              batch_size=self.tag_batch)
Exemplo n.º 2
0
        ignore_space=args.ignore_space)
    if args.sent_seg:
        print 'Joint sentence segmentation...'
    else:
        print 'Training set: %d instances; Dev set: %d instances.' % (len(
            train_x[0]), len(dev_x[0]))

    nums_grams = None
    ng_embs = None

    if args.ngram > 1 and (
            args.reset
            or not os.path.isfile(path + '/' + str(args.ngram) + 'gram.txt')):
        toolbox.get_ngrams(path, args.ngram, is_space)

    ngram = toolbox.read_ngrams(path, args.ngram)

    if args.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path,
                                          'tag_train.txt',
                                          gram2idx,
                                          limit=args.sent_limit,
                                          sent_seg=args.sent_seg,
                                          is_space=is_space,
                                          ignore_space=args.ignore_space)
        dev_gram = toolbox.get_gram_vec(path,
                                        'raw_dev.txt',
                                        gram2idx,
                                        is_raw=True,
                                        limit=args.sent_limit,
Exemplo n.º 3
0
        ignore_space=args.ignore_space)
    if args.sent_seg:
        print 'Joint sentence segmentation...'
    else:
        print 'Training set: %d instances; Dev set: %d instances.' % (len(
            train_x1[0]), len(dev_x1[0]))

    nums_grams = None
    ng_embs = None

    if args.ngram > 1 and (
            args.reset
            or not os.path.isfile(path_ + '/' + str(args.ngram) + 'gram.txt')):
        toolbox.get_ngrams(path_, args.ngram, is_space)

    ngram = toolbox.read_ngrams(path_, args.ngram)

    if args.ngram > 1:
        gram2idx = toolbox.get_ngram_dic(ngram)
        train_gram = toolbox.get_gram_vec(path_,
                                          'tag_train.txt',
                                          gram2idx,
                                          limit=args.sent_limit,
                                          sent_seg=args.sent_seg,
                                          is_space=is_space,
                                          ignore_space=args.ignore_space)
        dev_gram = toolbox.get_gram_vec(path_,
                                        'raw_dev.txt',
                                        gram2idx,
                                        is_raw=True,
                                        limit=args.sent_limit,