Exemplos de init_data em Python, exemplos de load_data.init_data em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: train.py Projeto: xyangk/NER-LSTM-CRF

def main():
    """Train the sequence-labeling model configured in ``./config.yml``.

    Steps, in order:
      1. read the YAML configuration;
      2. collect per-feature embedding shape / dropout rate and, when a path
         is configured, the pickled pre-trained embedding weights;
      3. load the feature and label vocabularies with ``load_vocs``;
      4. load the training data with ``init_data``;
      5. build a ``SequenceLabelingModel`` and call ``fit`` on it.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the configuration file. ``yaml.safe_load`` replaces the bare
    # ``yaml.load(stream)`` call: omitting ``Loader`` is deprecated since
    # PyYAML 5.1 and raises ``TypeError`` from PyYAML 6 on.
    with open('./config.yml') as file_config:
        config = yaml.safe_load(file_config)

    model_params = config['model_params']
    feature_names = model_params['feature_names']

    # Embedding shapes and dropout rates; pre-trained embeddings are also
    # initialised here when a path is configured for the feature.
    feature_weight_shape_dict = dict()
    feature_weight_dropout_dict = dict()
    feature_init_weight_dict = dict()
    for feature_name in feature_names:
        feature_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = feature_params['shape']
        feature_weight_dropout_dict[feature_name] = feature_params['dropout_rate']
        path_pre_train = feature_params['path']
        if path_pre_train:
            # NOTE(review): pickle can run arbitrary code while loading; only
            # point this path at files you trust.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Load the vocabularies: one per feature, the label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = [voc_params[name]['path'] for name in feature_names]
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the training data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    data_dict = init_data(
        path=data_params['path_train'], feature_names=feature_names, sep=sep,
        vocs=vocs, max_len=model_params['sequence_length'], model='train')

    # Train the model.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        clip=model_params['clip'],
        path_model=model_params['path_model'])

    model.fit(data_dict=data_dict, dev_size=model_params['dev_size'])

Exemplo n.º 2

0

Exibir arquivo

Arquivo: load_ner_tfserving_model.py Projeto: princepurohit153/ner-tensorflow-serving-tornado

def predict(testlist):
    """Label the sentences in ``testlist`` through the served NER model.

    The input is converted with ``init_data`` and sent to
    ``load_ner_service`` in batches of 16 rows with all dropout
    placeholders set to 0; each returned logit matrix is cut to its actual
    sequence length and Viterbi-decoded.

    Args:
        testlist: list of sentences, e.g. ``[[[u'T'], [u':'], ...], ...]``.

    Returns:
        list: one decoded tag sequence per input row.
    """
    feature_names, sep, vocs, max_len, use_char_feature, word_len = load_parameters()

    data_dict = init_data(
        feature_names=feature_names, sep=sep, test_sens=testlist, vocs=vocs,
        max_len=max_len, model='test', use_char_feature=use_char_feature,
        word_len=word_len)

    # Build the model feed data batch by batch.
    total_rows = data_dict['f1'].shape[0]
    batch_count = int(math.ceil(total_rows / 16.0))
    result_sequences = []  # decoded tag sequences
    for batch_no in range(batch_count):
        first_row = batch_no * 16
        # The last batch may be shorter than 16 rows.
        last_row = min(first_row + 16, total_rows)
        feed_dict = {
            'input_x_f1': data_dict['f1'][np.arange(first_row, last_row)],
            # dropout is disabled for inference
            'weight_dropout_ph_dict_f1': np.array(0.0, dtype=np.float32),
            'dropout_rate_ph': np.array(0., dtype=np.float32),
            'rnn_dropout_rate_ph': np.array(0., dtype=np.float32),
        }

        # Viterbi decoding on the part of each row that holds real tokens.
        logits, sequence_actual_length, transition_params = load_ner_service(feed_dict)
        result_sequences.extend(
            tf.contrib.crf.viterbi_decode(logit[:seq_len], transition_params)[0]
            for logit, seq_len in zip(logits, sequence_actual_length))

    return result_sequences

Exemplo n.º 3

0

Exibir arquivo

Arquivo: tfServing_model_predict.py Projeto: princepurohit153/ner-tensorflow-serving-tornado

def pre_feed_data(testlist):
    """Yield model feed dicts for ``testlist`` in batches of 16 rows.

    The input is converted with ``init_data``; for every batch a dict with
    the ``f1`` feature rows and all dropout placeholders set to 0 is printed
    and then yielded.

    Args:
        testlist: sentences to label, passed to ``init_data`` as
            ``test_sens``.

    Yields:
        dict: keys ``'input_x_f1'``, ``'weight_dropout_ph_dict_f1'``,
        ``'dropout_rate_ph'`` and ``'rnn_dropout_rate_ph'``.
    """
    # Load the parameters needed to convert the input.
    feature_names, sep, vocs, max_len, use_char_feature, word_len = load_parameters()
    data_dict = init_data(feature_names=feature_names,
                          sep=sep,
                          test_sens=testlist,
                          vocs=vocs,
                          max_len=max_len,
                          model='test',
                          use_char_feature=use_char_feature,
                          word_len=word_len)

    # Build the model feed data batch by batch.
    batch_size = 16
    data_count = data_dict['f1'].shape[0]
    nb_test = int(math.ceil(data_count / float(batch_size)))
    for i in range(nb_test):
        start = i * batch_size
        # The last batch may be shorter than ``batch_size`` rows.
        batch_indices = np.arange(start, min(start + batch_size, data_count))
        feed_dict = {
            'input_x_f1': data_dict['f1'][batch_indices],
            # dropout is disabled for inference
            'weight_dropout_ph_dict_f1': np.array(0.0, dtype=np.float32),
            'dropout_rate_ph': np.array(0.0, dtype=np.float32),
            'rnn_dropout_rate_ph': np.array(0.0, dtype=np.float32),
        }

        # Single pre-formatted argument: prints the same text under Python 2
        # and Python 3 (the former ``print a, b`` statement is a SyntaxError
        # on Python 3).
        print('feed_dict_tfserving %s' % (feed_dict,))
        yield feed_dict

Exemplo n.º 4

0

Exibir arquivo

        # print('adj2.size = ', adj2.size())
        output = model.forward(input1, input2, adj1, adj2)
        _, pre = torch.max(output, dim=1)
        tmp = np.zeros((1, 1), dtype=np.int)
        tmp[0][0] = label
        tmp_label = torch.from_numpy(tmp)
        pre = pre.cuda()
        tmp_label = tmp_label.cuda()
        if pre[0] == tmp_label[0][0]:
            acc_num += 1
    # print("accuracy : ", acc_num, "of ", test_count)
    return acc_num * 1.0 / test_count


if __name__ == '__main__':
    # Script entry point: load the graphs, build the classifier and start the
    # training loop. `datadir`, `dataname`, `learning_rate`, `num_epoches` and
    # `index` are not defined in this excerpt — presumably module-level
    # settings; TODO confirm.
    my_graphs, max_node_num1, max_node_num2 = init_data(datadir, dataname)
    # Shuffle the loaded graphs in place.
    random.shuffle(my_graphs)
    # Progress message ("data processing finished") with a timestamp.
    print("数据处理完成", time.asctime(time.localtime(time.time())))
    model = GraphClassifier(max_node_num1, max_node_num2)
    model = model.cuda()
    print('model:', model)
    crossentropy = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    # Progress message ("training started") with a timestamp.
    print('开始训练', time.asctime(time.localtime(time.time())))
    max_acc = 0
    print_loss = 0
    for epoch in range(num_epoches):
        for i in range(len(my_graphs)):
            # Only graphs at positions 0..index are visited in each epoch.
            if i > index:
                break
            torch.cuda.empty_cache()
            # NOTE(review): the excerpt ends here; the actual training step
            # for graph `i` is not shown.

Exemplo n.º 5

0

Exibir arquivo

Arquivo: test.py Projeto: zyfnhct/NER-LSTM-CRF

def main():
    """Label the test set with a trained model and write the result file.

    Steps, in order:
      1. read ``./config.yml``;
      2. collect per-feature embedding settings (plus the char embedding
         shape and convolution filter settings when ``use_char_feature`` is
         set);
      3. load the vocabularies and the test data;
      4. build a ``SequenceLabelingModel`` and restore its checkpoint;
      5. predict and write ``<input line>\\t<label>`` lines to
         ``data_params.path_result``, sentences separated by a blank line.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the configuration file. ``yaml.safe_load`` replaces the bare
    # ``yaml.load(stream)`` call: omitting ``Loader`` is deprecated since
    # PyYAML 5.1 and raises ``TypeError`` from PyYAML 6 on.
    with open('./config.yml') as file_config:
        config = yaml.safe_load(file_config)

    model_params = config['model_params']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropout rates; pre-trained embeddings are also
    # initialised here when a path is configured for the feature.
    feature_weight_shape_dict = dict()
    feature_weight_dropout_dict = dict()
    feature_init_weight_dict = dict()
    for feature_name in feature_names:
        feature_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = feature_params['shape']
        feature_weight_dropout_dict[feature_name] = feature_params['dropout_rate']
        path_pre_train = feature_params['path']
        if path_pre_train:
            # NOTE(review): pickle can run arbitrary code while loading; only
            # point this path at files you trust.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution filter settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: char (optional) first, then one per feature,
    # the label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the test data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    data_dict = init_data(
        path=data_params['path_test'], feature_names=feature_names, sep=sep,
        vocs=vocs, max_len=max_len, model='test',
        use_char_feature=use_char_feature, word_len=word_len)

    # Build the model and restore the trained weights.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=model_params['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, model_params['path_model'])

    # Tag the test data.
    viterbi_sequences = model.predict(data_dict)

    # Invert the label vocabulary (index -> label) for writing the output.
    label_voc = {vocs[-1][key]: key for key in vocs[-1]}

    with codecs.open(data_params['path_test'], 'r', encoding='utf-8') as file_r:
        sentences = file_r.read().strip().split('\n\n')

    # ``with`` guarantees the result file is closed even if writing fails.
    with codecs.open(
            data_params['path_result'], 'w', encoding='utf-8') as file_result:
        for i, sentence in enumerate(sentences):
            for j, item in enumerate(sentence.split('\n')):
                if j < len(viterbi_sequences[i]):
                    file_result.write(
                        '%s\t%s\n' % (item, label_voc[viterbi_sequences[i][j]]))
                else:
                    # Tokens beyond the predicted sequence get the 'O' label.
                    file_result.write('%s\tO\n' % item)
            file_result.write('\n')

Exemplo n.º 6

0

Exibir arquivo

Arquivo: train.py Projeto: xiaopangxia/DS_CTT

def main():
    """Train the sequence-labeling model from the ``config5`` YAML file.

    Steps, in order:
      1. read ``./train_config/config_b2b_tag_5_only_jieba.yml``;
      2. collect per-feature embedding settings (plus the char embedding
         shape and convolution filter settings when ``use_char_feature`` is
         set);
      3. load the vocabularies and the training data;
      4. build a ``SequenceLabelingModel`` and call ``fit`` on it.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the configuration file. ``yaml.safe_load`` replaces the bare
    # ``yaml.load(stream)`` call: omitting ``Loader`` is deprecated since
    # PyYAML 5.1 and raises ``TypeError`` from PyYAML 6 on.
    print("config5")
    with open('./train_config/config_b2b_tag_5_only_jieba.yml') as file_config:
        config = yaml.safe_load(file_config)

    model_params = config['model_params']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropout rates; pre-trained embeddings are also
    # initialised here when a path is configured for the feature.
    feature_weight_shape_dict = dict()
    feature_weight_dropout_dict = dict()
    feature_init_weight_dict = dict()
    for feature_name in feature_names:
        feature_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = feature_params['shape']
        feature_weight_dropout_dict[feature_name] = feature_params['dropout_rate']
        path_pre_train = feature_params['path']
        if path_pre_train:
            # NOTE(review): pickle can run arbitrary code while loading; only
            # point this path at files you trust.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution filter settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: char (optional) first, then one per feature,
    # the label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the training data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    data_dict = init_data(path=data_params['path_train'],
                          feature_names=feature_names,
                          sep=sep,
                          vocs=vocs,
                          max_len=max_len,
                          model='train',
                          use_char_feature=use_char_feature,
                          word_len=word_len)

    # Train the model.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        rnn_dropout=model_params['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        clip=model_params['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=model_params['conv_dropout'],
        word_length=word_len,
        path_model=model_params['path_model'])

    model.fit(data_dict=data_dict, dev_size=model_params['dev_size'])

Exemplo n.º 7

0

Exibir arquivo

def predict(string):
    """Tag ``string`` with the NER model and extract ACT / TAR / DAT entities.

    ``writetxt(string)`` supplies the token rows (``lab``) — presumably it
    also writes the test file that ``init_data`` reads below; TODO confirm.
    The module-level ``model`` is restored from its checkpoint and used to
    label the data; tokens are then grouped per entity family following the
    B/I/E/S tags of the label vocabulary ``vocs[-1]``.

    Args:
        string: the raw text to analyse.

    Returns:
        ``'ok;None'`` when the first token row is empty; otherwise a tuple
        ``(finalAction, finalTarget, finalData)`` of strings, each holding
        the recognised entities joined by ``'***'``, or ``'0'`` when none
        was found.
        NOTE(review): the two return shapes differ (str vs. tuple); kept as
        is because callers may depend on it.
    """
    lab = writetxt(string)
    # Nothing to tag.
    if len(lab[0]) == 0:
        return 'ok;None'

    # Load the data.
    sep_str = config['data_params']['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    data_dict = init_data(path=config['data_params']['path_test'],
                          feature_names=feature_names,
                          sep=sep,
                          vocs=vocs,
                          max_len=config['model_params']['sequence_length'],
                          model='test')

    saver = tf.train.Saver()
    saver.restore(model.sess, config['model_params']['path_model'])

    seq = model.predict(data_dict)
    print(seq)

    label_voc = vocs[-1]
    finalAction = _join_tagged_spans(seq, lab, label_voc, 'ACT')
    finalTarget = _join_tagged_spans(seq, lab, label_voc, 'TAR')
    finalData = _join_tagged_spans(seq, lab, label_voc, 'DAT')
    return finalAction, finalTarget, finalData


def _join_tagged_spans(seq, lab, label_voc, suffix):
    """Join the ``suffix`` entities tagged in ``seq`` with ``'***'``.

    Args:
        seq: predicted tag-index sequences, one per sentence.
        lab: token rows; ``lab[i][j]`` is the token tagged by ``seq[i][j]``.
        label_voc: mapping from tag name (e.g. ``'B_ACT'``) to tag index.
        suffix: entity family, e.g. ``'ACT'``, ``'TAR'`` or ``'DAT'``.

    Returns:
        str: the collected entities joined by ``'***'``, or ``'0'`` when
        there is none.
    """
    begin = label_voc['B_' + suffix]
    inside = label_voc['I_' + suffix]
    end = label_voc['E_' + suffix]
    single = label_voc['S_' + suffix]

    spans = []
    for i, tags in enumerate(seq):
        current = ""
        for j, tag in enumerate(tags):
            if tag == begin:
                # A B tag starts a new entity and drops any unfinished one.
                current = lab[i][j]
            elif tag == inside:
                current += lab[i][j]
            elif tag == end:
                # Only an E tag emits a multi-token entity.
                current += lab[i][j]
                spans.append(current)
            elif tag == single:
                spans.append(lab[i][j])
    return '***'.join(spans) or '0'

Exemplo n.º 8

0

Exibir arquivo

def main():
    """Train the sequence-labeling model configured in ``./config.yml``.

    Steps, in order:
      1. read the YAML configuration;
      2. collect per-feature embedding shape / dropout rate and the pickled
         initial embedding weights, logging each collection;
      3. read the char embedding shape and convolution filter settings when
         ``use_char_feature`` is set;
      4. load the vocabularies and the training data;
      5. build a ``SequenceLabelingModel`` and call ``fit`` on it.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the configuration file. ``yaml.safe_load`` replaces the bare
    # ``yaml.load(stream)`` call: omitting ``Loader`` is deprecated since
    # PyYAML 5.1 and raises ``TypeError`` from PyYAML 6 on.
    with open('./config.yml') as file_config:
        config = yaml.safe_load(file_config)

    model_params = config['model_params']
    feature_names = model_params['feature_names']
    logger.info(feature_names)
    use_char_feature = model_params['use_char_feature']
    logger.info(use_char_feature)

    # Embedding shapes, dropout rates and pre-trained embeddings.
    feature_weight_shape_dict = dict()
    feature_weight_dropout_dict = dict()
    feature_init_weight_dict = dict()
    for feature_name in feature_names:
        feature_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = feature_params['shape']
        feature_weight_dropout_dict[feature_name] = feature_params['dropout_rate']
        # Per the original author's note: the embedding matrix has two more
        # rows than the vocabulary because vocabulary indices start at 2;
        # rows 0 and 1 are zero-filled.
        path_pre_train = feature_params['path']  # embedding matrix location
        # NOTE(review): there is no ``if path_pre_train`` guard here, so every
        # feature must have a path configured. pickle can run arbitrary code
        # while loading; only point this path at files you trust.
        with open(path_pre_train, 'rb') as file_r:
            feature_init_weight_dict[feature_name] = pickle.load(file_r)
    logger.info(feature_weight_dropout_dict)
    logger.info(feature_weight_shape_dict)
    logger.info(feature_init_weight_dict)

    # Char embedding shape and convolution filter settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies (used to turn text into index sequences): char
    # (optional) first, then one per feature, the label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the training data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']  # how columns are separated
    sep = '\t' if sep_str == 'table' else ' '
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']

    # The vocabularies convert the input features and the output labels to
    # index sequences, giving the training inputs and targets.
    data_dict = init_data(path=data_params['path_train'],
                          feature_names=feature_names,
                          sep=sep,
                          vocs=vocs,
                          max_len=max_len,
                          model='train',
                          use_char_feature=use_char_feature,
                          word_len=word_len)
    logger.info(data_dict)  # the serialised data of every feature

    # Train the model.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],  # fixed sentence length
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        rnn_dropout=model_params['bilstm_params']['rnn_dropout'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        clip=model_params['clip'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        cnn_dropout_rate=model_params['conv_dropout'],
        word_length=word_len,
        path_model=model_params['path_model'],
        # Per the original author's note: a previous session can be loaded
        # here to speed up training.
        last_train_sess_path=None,
        # Per the original author's note: whether to do transfer learning on
        # the loaded parameters; True resets the LSTM output layer.
        transfer=False)

    model.fit(data_dict=data_dict, dev_size=model_params['dev_size'])
    """

Exemplo n.º 9

0

Exibir arquivo

Arquivo: predict.py Projeto: princepurohit153/ner-tensorflow-serving-tornado

def predict(testlist):
    """Label the sentences in ``testlist`` with the trained model.

    Reads ``./config.yml``, collects the embedding settings, loads the
    vocabularies, converts ``testlist`` with ``init_data``, restores a
    ``SequenceLabelingModel`` from its checkpoint and predicts.

    Args:
        testlist: list of sentences; each sentence is a list of items whose
            first element (``item[0]``) is the token text.

    Returns:
        list: one list per sentence of ``[token, label]`` pairs. Tokens
        beyond the predicted sequence get the label ``'O'``.

    Raises:
        AssertionError: if ``data_params.sep`` is neither ``'table'`` nor
            ``'space'``.
    """
    # Load the configuration file. ``yaml.safe_load`` replaces the bare
    # ``yaml.load(stream)`` call: omitting ``Loader`` is deprecated since
    # PyYAML 5.1 and raises ``TypeError`` from PyYAML 6 on.
    with open('./config.yml') as file_config:
        config = yaml.safe_load(file_config)

    model_params = config['model_params']
    feature_names = model_params['feature_names']
    use_char_feature = model_params['use_char_feature']

    # Embedding shapes and dropout rates; pre-trained embeddings are also
    # initialised here when a path is configured for the feature.
    feature_weight_shape_dict = dict()
    feature_weight_dropout_dict = dict()
    feature_init_weight_dict = dict()
    for feature_name in feature_names:
        feature_params = model_params['embed_params'][feature_name]
        feature_weight_shape_dict[feature_name] = feature_params['shape']
        feature_weight_dropout_dict[feature_name] = feature_params['dropout_rate']
        path_pre_train = feature_params['path']
        if path_pre_train:
            # NOTE(review): pickle can run arbitrary code while loading; only
            # point this path at files you trust.
            with open(path_pre_train, 'rb') as file_r:
                feature_init_weight_dict[feature_name] = pickle.load(file_r)

    # Char embedding shape and convolution filter settings.
    if use_char_feature:
        feature_weight_shape_dict['char'] = \
            model_params['embed_params']['char']['shape']
        conv_filter_len_list = model_params['conv_filter_len_list']
        conv_filter_size_list = model_params['conv_filter_size_list']
    else:
        conv_filter_len_list = None
        conv_filter_size_list = None

    # Load the vocabularies: char (optional) first, then one per feature,
    # the label vocabulary last.
    data_params = config['data_params']
    voc_params = data_params['voc_params']
    path_vocs = []
    if use_char_feature:
        path_vocs.append(voc_params['char']['path'])
    path_vocs.extend(voc_params[name]['path'] for name in feature_names)
    path_vocs.append(voc_params['label']['path'])
    vocs = load_vocs(path_vocs)

    # Load the data.
    sep_str = data_params['sep']
    assert sep_str in ['table', 'space']
    sep = '\t' if sep_str == 'table' else ' '
    max_len = model_params['sequence_length']
    word_len = model_params['word_length']
    data_dict = init_data(path=data_params['path_test'],
                          feature_names=feature_names,
                          sep=sep,
                          test_sens=testlist,
                          vocs=vocs,
                          max_len=max_len,
                          model='test',
                          use_char_feature=use_char_feature,
                          word_len=word_len)

    # Build the model and restore the trained weights.
    model = SequenceLabelingModel(
        sequence_length=model_params['sequence_length'],
        nb_classes=model_params['nb_classes'],
        nb_hidden=model_params['bilstm_params']['num_units'],
        num_layers=model_params['bilstm_params']['num_layers'],
        feature_weight_shape_dict=feature_weight_shape_dict,
        feature_init_weight_dict=feature_init_weight_dict,
        feature_weight_dropout_dict=feature_weight_dropout_dict,
        dropout_rate=model_params['dropout_rate'],
        nb_epoch=model_params['nb_epoch'],
        feature_names=feature_names,
        batch_size=model_params['batch_size'],
        train_max_patience=model_params['max_patience'],
        use_crf=model_params['use_crf'],
        l2_rate=model_params['l2_rate'],
        rnn_unit=model_params['rnn_unit'],
        learning_rate=model_params['learning_rate'],
        use_char_feature=use_char_feature,
        conv_filter_size_list=conv_filter_size_list,
        conv_filter_len_list=conv_filter_len_list,
        word_length=word_len,
        path_model=model_params['path_model'])
    saver = tf.train.Saver()
    saver.restore(model.sess, model_params['path_model'])

    # Tag the data.
    result_sequences = model.predict(data_dict)

    # Invert the label vocabulary (index -> label) for building the output.
    label_voc = {vocs[-1][key]: key for key in vocs[-1]}

    outlist = []
    for i, sentence in enumerate(testlist):
        templist = []
        for j, item in enumerate(sentence):
            char = item[0]
            if j < len(result_sequences[i]):
                label = label_voc[result_sequences[i][j]]
            else:
                # Tokens beyond the predicted sequence get the 'O' label.
                label = 'O'
            templist.append([char, label])
        outlist.append(templist)
    return outlist

Exemplo n.º 10

0

Exibir arquivo

# Load the data. `config` and `feature_names` are defined earlier in the
# script, outside this excerpt.

# Load the vocabularies: one path per feature, the label vocabulary last.
path_vocs = []
for feature_name in feature_names:
    path_vocs.append(config['data_params']['voc_params'][feature_name]['path'])
path_vocs.append(config['data_params']['voc_params']['label']['path'])
vocs = load_vocs(path_vocs)

# Load the training data. The configured separator name must be 'table'
# (mapped to a tab character) or 'space' (mapped to a single space).
sep_str = config['data_params']['sep']
assert sep_str in ['table', 'space']
sep = '\t' if sep_str == 'table' else ' '
data_dict = init_data(path=config['data_params']['path_train'],
                      feature_names=feature_names,
                      sep=sep,
                      vocs=vocs,
                      max_len=config['model_params']['sequence_length'],
                      model='train')

# 训练模型
model = SequenceLabelingModel(
    sequence_length=config['model_params']['sequence_length'],
    nb_classes=config['model_params']['nb_classes'],
    nb_hidden=config['model_params']['bilstm_params']['num_units'],
    feature_weight_shape_dict=feature_weight_shape_dict,
    feature_init_weight_dict=feature_init_weight_dict,
    feature_weight_dropout_dict=feature_weight_dropout_dict,
    dropout_rate=config['model_params']['dropout_rate'],
    nb_epoch=config['model_params']['nb_epoch'],
    feature_names=feature_names,
    batch_size=config['model_params']['batch_size'],