# Excerpt of a set-up routine (the enclosing function header is not visible):
# builds/loads the vocabulary files, optionally loads pre-trained embeddings,
# then builds the lookup dictionaries and the train/dev input vectors.
ngram = args.ngram
    # (Re)generate the vocabulary/tag files when either the n-gram file
    # ("<ngram>gram.txt") or "chars.txt" is missing under `path`.
    if not os.path.isfile(path + '/' + str(ngram) + 'gram.txt') \
            or (not os.path.isfile(path + '/' + 'chars.txt')):
        toolbox.get_vocab_tag(path, [train_file, dev_file], ngram=ngram)
    # Read the text information. Note that `ngram` is rebound here to the
    # third value returned by read_vocab_tag.
    chars, tags, ngram = toolbox.read_vocab_tag(path, ngram)

    # `emb` stays None unless a pre-trained embeddings file was supplied.
    emb = None
    emb_dim = args.embeddings_dimension
    if args.embeddings is not None:
        # Read the pre-trained character embeddings.
        print 'Reading embeddings...'
        # Short name = text between the first '/' and the first '.' of the
        # embeddings path. str.index raises ValueError if either is absent.
        short_emb = args.embeddings[args.embeddings.index('/') +
                                    1:args.embeddings.index('.')]
        # Create "<short_emb>_sub.txt" only if it is not already on disk;
        # only the first element of each `chars` entry is passed along.
        if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
            toolbox.get_sample_embedding(path, args.embeddings,
                                         map(lambda x: x[0], chars))
        emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
        # The dimension read back must equal the configured one.
        assert args.embeddings_dimension == emb_dim
    else:
        print 'Using random embeddings...'

    # Build the char/tag lookup tables (both directions) plus char2freq.
    char2idx, idx2char, char2freq, tag2idx, idx2tag = toolbox.get_dic(
        chars, tags, args.char_freq_loss)

    # train_x: shape=(2, number of sentences); the 2 stands for the
    # characters themselves + their radicals.
    train_x, train_y, train_max_slen_c, train_max_slen_w, train_max_wlen = \
        toolbox.get_input_vec(path, train_file, char2idx, tag2idx, char2freq, tag_scheme=args.tag_scheme)
    dev_x, dev_y, dev_max_slen_c, dev_max_slen_w, dev_max_wlen = \
        toolbox.get_input_vec(path, dev_file, char2idx, tag2idx, char2freq, tag_scheme=args.tag_scheme)

    # Read the n-gram vectors.
Exemplo n.º 2
0
    # (Re)generate the vocabulary/tag files when needed. `and` binds tighter
    # than `or`, so the condition reads:
    #   (ngram > 1 and the n-gram file is missing) or chars.txt is missing.
    if args.ngram > 1 and not os.path.isfile(path + '/' + str(args.ngram) + 'gram.txt') \
            or (not os.path.isfile(path + '/' + 'chars.txt')):
        toolbox.get_vocab_tag(path, [train_file, dev_file], ngram=args.ngram)
    # Read the text information.
    chars, tags, ngram = toolbox.read_vocab_tag(path, args.ngram)

    # Read the pre-trained character embeddings.
    # `emb` stays None unless an embeddings file is actually loaded below.
    emb = None
    emb_dim = args.embeddings_dimension
    if args.word_vector:
        if args.embeddings is not None:
            print 'Reading embeddings...'
            # Short name = text between the first '/' and the first '.' of
            # the embeddings path. str.index raises ValueError if either
            # character is absent.
            short_emb = args.embeddings[args.embeddings.index('/') +
                                        1:args.embeddings.index('.')]
            # Create "<short_emb>_sub.txt" only if it is not on disk yet.
            if not os.path.isfile(path + '/' + short_emb + '_sub.txt'):
                toolbox.get_sample_embedding(path, args.embeddings, chars)
            emb_dim, emb = toolbox.read_sample_embedding(path, short_emb)
            # The dimension read back must equal the configured one.
            assert args.embeddings_dimension == emb_dim
        else:
            print 'Using random embeddings...'
    else:
        # When `args.word_vector` is falsy, `args.pixels` must be truthy.
        assert args.pixels

    # Read the radical dictionary.
    rad_dic = None
    if args.radical:
        print 'Using Radical dictionary...'
        rad_dic = toolbox.get_radical_dic()

    # Read the character image information.
    pixels = None
Exemplo n.º 3
0
                                     tag_scheme=args.tags)

        # Regenerate chars.txt when a reset is requested or the file does
        # not exist yet under `path`.
        if args.reset or not os.path.isfile(path + '/chars.txt'):
            toolbox.get_chars(path, ['raw_train.txt', 'raw_dev.txt'],
                              sea=is_space)

    # NOTE(review): from here on the code uses `path_`, whereas the chars.txt
    # check above uses `path`; both are defined outside this excerpt —
    # confirm they are meant to differ.
    char2idx, unk_chars_idx, idx2char, tag2idx, idx2tag, trans_dict = toolbox.get_dicts_new(
        path_, args.sent_seg, args.tags, args.crf)

    if args.embeddings is not None:
        print 'Reading embeddings...'
        # Short name = text between the first '/' and the first '.' of the
        # embeddings path. str.index raises ValueError if either is absent.
        short_emb = args.embeddings[args.embeddings.index('/') +
                                    1:args.embeddings.index('.')]
        # Rebuild "<short_emb>_sub.txt" on reset or when it is missing.
        if args.reset or not os.path.isfile(path_ + '/' + short_emb +
                                            '_sub.txt'):
            toolbox.get_sample_embedding(path_, args.embeddings, char2idx)
        emb_dim, emb, valid_chars = toolbox.read_sample_embedding(
            path_, short_emb, char2idx)
        # Drop every character reported in `valid_chars` from the unknown
        # index collection (presumably because it has a pre-trained vector —
        # confirm in toolbox.read_sample_embedding).
        for vch in valid_chars:
            if char2idx[vch] in unk_chars_idx:
                unk_chars_idx.remove(char2idx[vch])
    else:
        # No embeddings file given: take the dimension from the arguments
        # and leave `emb` unset.
        emb_dim = args.emb_dimension
        emb = None

    train_x1, train_x2, train_y, max_len_train = toolbox.get_input_vec_new(
        path_,
        char2idx,
        tag2idx,
        limit=args.sent_limit,
        sent_seg=args.sent_seg,