Example No. 1
def init_data(path, feature_names, vocs, max_len, word_len, word_id,
              model='train', sep='\t'):
    """
    加载数据
    Args:
        path: str, 数据路径
        feature_names: list of str, 特征名称
        vocs: list of dict
        max_len: int, 句子最大长度
        word_len: int, 单词最大
        model: str, in ('train', 'test')
        sep: str, 特征之间的分割符, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    file_r = codecs.open(path, 'r', encoding='utf-8')
    sentences = file_r.read().strip().split('\n\n')
    sentence_count = len(sentences)
    print('sentence count:', sentence_count)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros(
            (sentence_count, max_len), dtype='int32')
    data_dict['char'] = np.zeros(
        (sentence_count, max_len, word_len), dtype='int32')
    if model == 'train':
        data_dict['label'] = np.zeros((len(sentences), max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\n')
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        # allocate one char list per token line
        char_instance_item = [[] for _ in range(len(items))]
        for item_num, item in enumerate(items):
            feature_tokens = item.split(sep)
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
                if j == word_id - 1:
                    # collect up to word_len characters of the word feature
                    for num, w in enumerate(feature_tokens[j]):
                        if num == word_len:
                            break
                        char_instance_item[item_num].append(w)
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        # guard against sentences longer than max_len
        for i in range(min(len(items), max_len)):
            data_dict['char'][index][i] = map_item2id(
                char_instance_item[i], vocs[-1], word_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-2], max_len)
    file_r.close()
    return data_dict
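All of the examples on this page call a helper named map_item2id that is not shown. Judging from how it is used (a token list, a vocabulary dict, and a fixed output length), a minimal sketch might look like the following; the zero-padding and OOV conventions are assumptions, not the confirmed implementation:

import numpy as np

def map_item2id(items, voc, max_len):
    # Hypothetical reconstruction: map each item to its id in `voc`,
    # truncate to `max_len`, and pad the remainder with 0 (also used for OOV).
    arr = np.zeros(max_len, dtype='int32')
    for i, item in enumerate(items[:max_len]):
        arr[i] = voc.get(item, 0)
    return arr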
Example No. 2
def init_data(path, feature_names, vocs, max_len, model='train',
              use_char_feature=False, word_len=None, sep='\t'):
    """
    加载数据(待优化,目前是一次性加载整个数据集)
    Args:
        path: str, 数据路径
        feature_names: list of str, 特征名称
        vocs: list of dict
        max_len: int, 句子最大长度
        model: str, in ('train', 'test')
        use_char_feature: bool,是否使用char特征
        word_len: None or int,单词最大长度
        sep: str, 特征之间的分割符, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    file_r = codecs.open(path, 'r', encoding='utf-8')
    sentences = file_r.read().strip().split('\r\n\r\n')
    sentence_count = len(sentences)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentence_count, max_len), dtype='int32')
    # char feature
    if use_char_feature:
        data_dict['char'] = np.zeros(
            (sentence_count, max_len, word_len), dtype='int32')
        char_voc = vocs.pop(0)
    if model == 'train':
        data_dict['label'] = np.zeros((len(sentences), max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\r\n')
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        for item in items:
            feature_tokens = item.split(sep)
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if use_char_feature:
            for i, word in enumerate(one_instance_items[0]):
                if i >= max_len:
                    break
                data_dict['char'][index][i, :] = map_item2id(
                    word, char_voc, word_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-1], max_len)
        sys.stdout.write('loading data: %d\r' % index)
    file_r.close()
    return data_dict
def init_data(path, feature_names, vocs, max_len, model='train', sep='\t'):
    """
    加载数据(待优化,目前是一次性加载整个数据集)
    Args:
        path: str, 数据路径
        feature_names: list of str, 特征名称
        vocs: list of dict
        max_len: int, 句子最大长度
        model: str, in ('train', 'test')
        sep: str, 特征之间的分割符, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    file_r = codecs.open(path, 'r', encoding='utf-8')
    sentences = file_r.read().strip().split('\n\n')
    sentence_count = len(sentences)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentence_count, max_len),
                                           dtype='int32')
    if model == 'train':
        data_dict['label'] = np.zeros((len(sentences), max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\n')

        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        for item in items:
            feature_tokens = item.split(sep)
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(one_instance_items[-1],
                                                       vocs[-1], max_len)
    file_r.close()

    return data_dict
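A hedged usage sketch for the two init_data variants above (the first expects CRLF line endings, the second plain LF), assuming a CoNLL-style file where each line holds a token, a POS tag, and a label separated by tabs, with a blank line between sentences; the file name, feature names, and toy vocabularies are illustrative only:

# layout of a hypothetical train.txt (tab-separated, blank line between sentences):
#   EU      NNP     B-ORG
#   rejects VBZ     O
word_voc = {'EU': 1, 'rejects': 2}   # toy vocabularies, 0 reserved for padding/OOV
pos_voc = {'NNP': 1, 'VBZ': 2}
label_voc = {'B-ORG': 1, 'O': 2}
data = init_data('train.txt', ['word', 'pos'],
                 [word_voc, pos_voc, label_voc],
                 max_len=50, model='train')
print(data['word'].shape, data['label'].shape)  # both (sentence_count, 50)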
Example No. 4
def load_session_infer_data(path, feature_names, vocs, max_len, model='test'):
    assert model in ['train', 'test']
    assert model == 'test'

    fr = open(path, 'r', encoding='utf-8')
    samples = fr.read().strip().split('\n\n')
    print('number of samples', len(samples))
    data_dict = collections.OrderedDict()

    for i, sample in enumerate(samples):
        sentences = sample.split('\n')
        ss = sentences[0].split('\t')
        assert len(ss) == 3

        sid = ss[0]
        intent = None
        feat_dict = {}
        for feature_name in feature_names:
            feat_dict[feature_name] = []
        slot = []
        for sentence in sentences[1:]:
            ss = sentence.split('\t')
            for j, feat_name in enumerate(feature_names):
                feat_dict[feat_name].append(ss[j])
            if model == 'train':
                slot += [ss[-1]]
        if sid not in data_dict:
            data_dict[sid] = []
        data_dict[sid].append((intent, slot, feat_dict))

    # index all features
    max_turn = max([len(data_dict[x]) for x in data_dict])
    print('number of sessions', len(data_dict))
    print('max turn of sessions', max_turn)

    idx_dict = collections.OrderedDict()
    for sid in data_dict:
        session_list = data_dict[sid]
        session_x = []
        for label, slot, feat_dict in session_list:
            feat_idx_dict = dict()
            for i, feat_name in enumerate(feature_names):
                feat_idx_dict[feat_name] = map_item2id(feat_dict[feat_name],
                                                       vocs[i], max_len)
            session_x += [[feat_idx_dict, None, None]]
        idx_dict[sid] = session_x
    return idx_dict
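A hedged sketch of how this loader might be driven, inferred from the parsing above: each sample starts with a three-column header line whose first field is the session id, followed by one tab-separated feature line per token; samples sharing an id form one multi-turn session. The file name and vocabularies are illustrative:

# hypothetical layout of one sample in infer.txt:
#   <sid>\t<field2>\t<field3>        <- header line, exactly 3 columns
#   <token>\t<pos>                   <- one line per token, feature columns only
sessions = load_session_infer_data('infer.txt', ['word', 'pos'],
                                   [word_voc, pos_voc], max_len=50)
for sid, turns in sessions.items():
    feat_idx_dict, _, _ = turns[0]   # each turn is [feat_idx_dict, None, None]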
def init_data(feature_names,
              vocs,
              max_len,
              model='train',
              path=None,
              test_sens=None,
              use_char_feature=False,
              word_len=None,
              sep='\t'):
    """
    加载数据(待优化,目前是一次性加载整个数据集)
    Args:
        path: str, 数据路径
        test_sens: list, [[[u'白带常规', u 'ni', u 'S_ex_name'],[u ':', u 'w', u 'O'],[],...],[[],[],[],...],...]
        feature_names: list of str, 特征名称
        vocs: list of dict
        max_len: int, 句子最大长度
        model: str, in ('train', 'test')
        use_char_feature: bool,是否使用char特征
        word_len: None or int,单词最大长度
        sep: str, 特征之间的分割符, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')
    if model == 'train':
        with codecs.open(path, 'r', encoding='utf8') as file_r:
            sentences = file_r.read().strip().split('\n\n')
        sentences = [[j.split(sep) for j in sen.split('\n')]
                     for sen in sentences]
    else:
        if not test_sens:
            raise ValueError('Test corpus must be non-empty!')
        sentences = test_sens

    sentences_count = len(sentences)
    print('sentences_count:', sentences_count)
    feature_count = len(feature_names)
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentences_count, max_len),
                                           dtype='int32')

    # char feature
    if use_char_feature:
        data_dict['char'] = np.zeros((sentences_count, max_len, word_len),
                                     dtype='int32')
        char_voc = vocs.pop(0)
    if model == 'train':
        data_dict['label'] = np.zeros((len(sentences), max_len), dtype='int32')
    for index, items in enumerate(sentences):
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]
        for feature_tokens in items:
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)
        if use_char_feature:
            for i, word in enumerate(one_instance_items[0]):
                if i >= max_len:
                    break
                data_dict['char'][index][i, :] = map_item2id(
                    word, char_voc, word_len)
        if model == 'train':
            data_dict['label'][index, :] = map_item2id(one_instance_items[-1],
                                                       vocs[-1], max_len)
            print('loading data: %d\r' % index)

    return data_dict
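In test mode this variant skips the file entirely and consumes pre-tokenized sentences via test_sens. A sketch of the expected nesting, reusing the same illustrative vocabularies as before:

# one sentence with two tokens; each token carries two feature columns
test_sens = [[[u'白带常规', u'ni'], [u':', u'w']]]
data = init_data(['word', 'pos'], [word_voc, pos_voc], max_len=50,
                 model='test', test_sens=test_sens)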
Example No. 6
def predict(self, query):
    # split the query into single characters
    items = [c for c in query]
    data_dict = {'f1': np.zeros((1, self.max_len))}
    data_dict['f1'][0, :] = map_item2id(items, self.vocs[0], self.max_len)
    # run the model and map the predicted id back to its label
    res = self.model.predict(data_dict)
    return self.label_voc[res[0][0] + 1]
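A usage sketch for the method above; clf stands for whatever wrapper object carries model, vocs, max_len, and label_voc (hypothetical names, not confirmed by the snippet):

# clf = Classifier(...)                        # hypothetical wrapper construction
# label = clf.predict(u'帮我查一下明天的天气')  # classify one query string
# print(label)                                 # a label string from clf.label_voc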
Example No. 7
def init_data(path, feature_names, vocs, max_len, model='train',
              use_char_feature=False, word_len=None, sep='\t'):
    """
    加载数据(待优化,目前是一次性加载整个数据集)
    Args:
        path: str, 数据路径
        feature_names: list of str, 特征名称
        vocs: list of dict
        max_len: int, 句子最大长度
        model: str, in ('train', 'test')
        use_char_feature: bool,是否使用char特征
        word_len: None or int,单词最大长度
        sep: str, 特征之间的分割符, default is '\t'
    Returns:
        data_dict: dict
    """
    assert model in ('train', 'test')  # restrict the dataset type to train or test
    file_r = codecs.open(path, 'r', encoding='utf-8')
    sentences = file_r.read().strip().split('\n\n')  # split the corpus into sentences

    sentence_count = len(sentences)  # number of sentences
    print("sentence number:", sentence_count)
    feature_count = len(feature_names)

    # initialize the dataset: data_dict holds one matrix per feature plus the training labels
    data_dict = dict()
    for feature_name in feature_names:
        data_dict[feature_name] = np.zeros((sentence_count, max_len), dtype='int32')
    # char feature
    if use_char_feature:
        data_dict['char'] = np.zeros(
            (sentence_count, max_len, word_len), dtype='int32')
        char_voc = vocs.pop(0)
    if model == 'train':  # the training set also stores labels
        data_dict['label'] = np.zeros((len(sentences), max_len), dtype='int32')
    for index, sentence in enumerate(sentences):
        items = sentence.split('\n')  # one line per token (features, label)
        # one list per feature column, plus one for the labels
        one_instance_items = [[] for _ in range(len(feature_names) + 1)]

        for item in items:
            if item == u"":
                continue
            feature_tokens = item.split(sep)  # split features and label on the separator
            for j in range(feature_count):
                one_instance_items[j].append(feature_tokens[j])
            if model == 'train':
                one_instance_items[-1].append(feature_tokens[-1])
        for i in range(len(feature_names)):
            # convert each feature column to int ids
            data_dict[feature_names[i]][index, :] = map_item2id(
                one_instance_items[i], vocs[i], max_len)

        if use_char_feature:
            for i, word in enumerate(one_instance_items[0]):
                if i >= max_len:
                    break
                data_dict['char'][index][i, :] = map_item2id(
                    word, char_voc, word_len)

        if model == 'train':
            data_dict['label'][index, :] = map_item2id(
                one_instance_items[-1], vocs[-1], max_len)
        sys.stdout.write('loading data: %d\r' % index)
    file_r.close()
    return data_dict
Example No. 8
def load_session_data(path, feature_names, vocs, max_len, model='train'):
    assert model in ['train', 'test']
    fr = open(path, 'r', encoding='utf-8')
    samples = fr.read().strip().split('\n\n')
    print('number of samples', len(samples))
    data_dict = collections.defaultdict(list)

    for i, sample in enumerate(samples):
        sentences = sample.split('\n')
        ss = sentences[0].split('\t')
        if model == 'train':
            assert len(ss) == 4
        else:
            assert len(ss) == 3

        sid = ss[0]
        intent = None
        if model == 'train':
            intent = ss[3]
        feat_dict = {}
        for feature_name in feature_names:
            feat_dict[feature_name] = []
        slot = []
        for sentence in sentences[1:]:
            ss = sentence.split('\t')
            for j, feat_name in enumerate(feature_names):
                feat_dict[feat_name].append(ss[j])
            if model == 'train':
                slot += [ss[-1]]

        data_dict[sid] = (intent, slot, feat_dict)

    # index all features
    print('number of sessions', len(data_dict))

    idx_dict = dict()
    for sid in data_dict:
        session_list = data_dict[sid]
        label, slot, feat_dict = session_list
        label_idx = INTENT_DIC.get(label, 0)
        slot_idx = map_item2id(slot, vocs[-1], max_len)
        feat_idx_dict = dict()
        for i, feat_name in enumerate(feature_names):
            feat_idx_dict[feat_name] = map_item2id(feat_dict[feat_name],
                                                   vocs[i], max_len)
        session_x = [feat_idx_dict, label_idx, slot_idx]
        idx_dict[sid] = session_x
    return idx_dict
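For training, the header line carries four columns with the intent string last, and every token line ends with a slot tag; INTENT_DIC (referenced above but not defined in the snippet) is assumed to map intent strings to ids. A hedged usage sketch with illustrative names:

INTENT_DIC = {'weather': 1, 'music': 2}   # assumed intent-to-id mapping
# hypothetical layout of one sample in train_sessions.txt:
#   <sid>\t<f2>\t<f3>\t<intent>      <- header line, 4 columns in train mode
#   <token>\t<pos>\t<slot-tag>       <- one line per token, slot tag last
sessions = load_session_data('train_sessions.txt', ['word', 'pos'],
                             [word_voc, pos_voc, slot_voc], max_len=50)
feat_idx_dict, intent_idx, slot_idx = sessions['some_sid']  # 'some_sid' is illustrative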