Example #1
def read_datasets():
    print("[Start Dataset Reading.]")
    data = {}
    # load word2vec model
    fasttext = load_w2v(word2vec_path)
    #fasttext = load_fasttext(fasttext_path)
    print("[Load Word2Vec model.]")

    # load normalized word embeddings
    norm_embedding = tool.norm_matrix(fasttext.wv.syn0)
    data['embedding'] = norm_embedding
    print("[Load normalized word embedding.]")

    # preprocess seen and unseen labels
    ex_dict, ex_vec = process_label(ex_intent, fasttext)
    em_dict, em_vec = process_label(em_intent, fasttext)
    print("[Preprocess labels.]")

    # transform data into embedding vectors
    max_len = 0
    x_ex, y_ex, ex_len, max_len = load_vec(training_data_path, fasttext,
                                           ex_dict, max_len)
    x_em, y_em, em_len, max_len = load_vec(test_data_path, fasttext, em_dict,
                                           max_len)

    label_ex, label_ex_len = load_vec_label(ex_intent, fasttext, max_len)
    label_em, label_em_len = load_vec_label(em_intent, fasttext, max_len)
    # existing intent

    #data['ex_intent']=ex_intent
    #data['em_intent']=em_intent

    data['label_ex'] = label_ex
    data['label_ex_len'] = label_ex_len

    data['label_em'] = label_em
    data['label_em_len'] = label_em_len

    data['x_ex'] = x_ex
    data['y_ex'] = y_ex

    data['ex_len'] = ex_len
    data['ex_vec'] = ex_vec
    data['ex_dict'] = ex_dict

    # emerging intent
    data['x_em'] = x_em
    data['y_em'] = y_em

    data['em_len'] = em_len
    data['em_vec'] = em_vec
    data['em_dict'] = em_dict

    data['max_len'] = max_len
    data['ex_label'] = get_label(data)
    # [0.0, 0.0, ..., 1.0, ..., 0.0]

    print("[Complete Dataset Reading.]")
    return data
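The w2v-based snippets all normalize the word-embedding matrix with tool.norm_matrix before storing it in data['embedding']. That helper is not shown anywhere in these examples; the sketch below is only a guess at a typical row-wise L2 normalization (the function body and the eps guard are assumptions, not the project's actual code).

import numpy as np

def norm_matrix(matrix, eps=1e-8):
    # Scale every row (word vector) to unit L2 norm; eps avoids division by zero.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.maximum(norms, eps)

# e.g. data['embedding'] = norm_matrix(fasttext.wv.vectors)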
Example #2
def read_datasets(from_data_dir, to_data_dir, args):
    """Read the datasets and build the train/valid sets and configuration needed for training."""
    print("Splitting raw data and saving into txt files...")
    if args.mode == 'seen_class':
        train_data_path, valid_data_path = split_seen_class_data(
            from_data_dir, to_data_dir, args)
    else:
        train_data_path, valid_data_path = split_zero_shot_data(
            from_data_dir, to_data_dir, args)

    # Setting configurations.
    tokenizer = None
    w2v = None
    bert_config = None
    if args.model_type in ('bert_capsnet', 'basic_capsnet'):
        print("Loading BertTokenizer...")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        if args.model_type == 'bert_capsnet':
            bert_config = BertConfig.from_pretrained("bert-base-uncased")
            args.max_len = min(args.max_len,
                               bert_config.max_position_embeddings)
    else:
        w2v_path = f'{args.data_dir}/GoogleNews-vectors-negative300.bin'
        assert os.path.isfile(
            w2v_path
        ), f"There is no w2v file. Please download the w2v file from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit, extract it, and put it in {args.data_dir}."

        w2v = load_w2v(w2v_path)

    # Keep the index of padding.
    if tokenizer is None:
        args.pad_id = 0
    else:
        args.pad_id = tokenizer.get_vocab()['[PAD]']

    # Preprocess train/test data
    print("Preprocessing train/test data...")
    train_set = CustomDataset(train_data_path, tokenizer, w2v, args.max_len,
                              args.pad_id)
    valid_set = CustomDataset(valid_data_path, tokenizer, w2v, args.max_len,
                              args.pad_id)

    # Depending on the model type, vocab_size, word_emb_size, and embedding can be different.
    if args.model_type == 'bert_capsnet':
        args.vocab_size = len(tokenizer.get_vocab())
        args.word_emb_size = bert_config.hidden_size
        args.embedding = None
    elif args.model_type == 'basic_capsnet':
        args.vocab_size = len(tokenizer.get_vocab())
        args.word_emb_size = 300
        args.embedding = None
    elif args.model_type == 'w2v_capsnet':
        w2v_shape = w2v.wv.vectors.shape
        args.vocab_size = w2v_shape[0]
        args.word_emb_size = w2v_shape[1]
        args.embedding = tool.norm_matrix(w2v.wv.vectors)

    return train_set, valid_set, args
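A hypothetical call site for Example #2. The module name (data_loader) and the argument values below are assumptions inferred from how args is used above, not part of the original repository.

from argparse import Namespace
from data_loader import read_datasets  # assumed module name

args = Namespace(
    mode='zero_shot',           # anything other than 'seen_class' takes the zero-shot split
    model_type='bert_capsnet',  # 'bert_capsnet' | 'basic_capsnet' | 'w2v_capsnet'
    data_dir='data',
    max_len=64,
)
train_set, valid_set, args = read_datasets('data/raw', 'data/processed', args)
print(len(train_set), len(valid_set), args.vocab_size, args.word_emb_size)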
Example #3
def read_datasets():
    print("------------------read datasets begin-------------------")
    data = {}

    # load word2vec model
    print("------------------load word2vec begin-------------------")
    w2v = load_w2v(word2vec_path)
    print("------------------load word2vec end---------------------")

    # load normalized word embeddings
    embedding = w2v.syn0
    norm_embedding = tool.norm_matrix(embedding)
    data['embedding'] = norm_embedding
    # preprocess seen and unseen labels
    sc_dict, sc_vec = process_label(seen_intent, w2v)
    uc_dict, uc_vec = process_label(unseen_intent, w2v)
    # transform data into embedding vectors
    max_len = 0
    x_tr, y_tr, s_len, max_len = load_vec(training_data_path, w2v, sc_dict,
                                          max_len)
    x_te, y_te, u_len, max_len = load_vec(test_data_path, w2v, uc_dict,
                                          max_len)

    data['x_tr'] = x_tr
    data['y_tr'] = y_tr

    data['s_len'] = s_len
    data['sc_vec'] = sc_vec
    data['sc_dict'] = sc_dict

    data['x_te'] = x_te
    data['y_te'] = y_te

    data['u_len'] = u_len
    data['uc_vec'] = uc_vec
    data['uc_dict'] = uc_dict

    data['max_len'] = max_len

    ind = get_label(data)
    data['s_label'] = ind  # [0.0, 0.0, ..., 1.0, ..., 0.0]
    print("------------------read datasets end---------------------")
    return data
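get_label is not defined in these snippets; judging from the inline comment ([0.0, 0.0, ..., 1.0, ..., 0.0]) and the scatter_-based one-hot in Example #4, it most likely builds a one-hot indicator matrix over the seen classes. A minimal sketch under that assumption, with y_tr taken to be integer class ids:

import numpy as np

def get_label(data):
    # One-hot indicator: row i carries a 1.0 at the seen-class id of y_tr[i].
    y_tr = np.asarray(data['y_tr'], dtype=np.int64)
    ind = np.zeros((y_tr.shape[0], len(data['sc_dict'])), dtype=np.float32)
    ind[np.arange(y_tr.shape[0]), y_tr] = 1.0
    return ind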
Example #4
def read_datasets(data_setting):
    print("------------------read datasets begin-------------------")

    data = dict()
    data['dataset'] = data_setting['dataset']

    # load data
    data_path = data_setting['data_prefix'] + data_setting['dataset_name']
    x_text, y_text, y, class_dict = load_data(data_path)

    # tokenize
    data['text_represent'] = data_setting['text_represent']
    data['key_pretrained'] = data_setting['key_pretrained']

    word2vec_path = data_setting['data_prefix'] + data_setting['wordvec_name']
    w2v = load_w2v(word2vec_path)
    if data['text_represent'] == 'w2v':
        x_pad, _, _ = tokenize_w2v(x_text, w2v)
        y_pad, data['n_vocab'], data['d_emb'] = tokenize_w2v(y_text, w2v)
        data['embedding'] = torch.from_numpy(tool.norm_matrix(w2v.syn0))
    elif data['text_represent'] == 'transformers':
        x_pad, data['n_vocab'], data['d_emb'] = tokenize_transformers(
            x_text, data['key_pretrained'])
        # TODO: n_vocab is not updated by y
        y_pad, _, _ = tokenize_transformers(y_text, data['key_pretrained'])

    # split dataset
    print('split dataset')
    if data_setting['freeze_class']:
        label_order = data_setting['label_order']
        unseen_class = data_setting['unseen_class']
        seen_class = [x for x in label_order if x not in unseen_class]
    else:
        if data_setting['dataset'] == 'SMP18':
            del class_dict['聊天']  # drop the '聊天' (chit-chat) class
        class_freq_dict = {
            k: (y == class_dict[k]).nonzero().shape[0]
            for k in class_dict.keys()
        }
        class_freq_dict = {k: v for k, v in class_freq_dict.items() if v > 2}
        label_order = list(class_freq_dict.keys())
        label_freq_np = np.array([class_freq_dict[l] for l in label_order])
        label_freq_np = label_freq_np / sum(label_freq_np)
        n_c_tr = math.ceil(len(label_order) * data_setting['seen_class_prob'])
        seen_class = list(
            np.random.choice(label_order,
                             size=n_c_tr,
                             replace=False,
                             p=label_freq_np))
        unseen_class = [x for x in label_order if x not in seen_class]
        print("unseen_class:\n", unseen_class)

    # update class_dict and y (first seen and then unseen classes in class_dict)
    class_dict = dict()
    for c in seen_class:
        class_dict[c] = len(class_dict)
    for c in unseen_class:
        class_dict[c] = len(class_dict)
    y = [
        class_dict[label] if label in class_dict.keys() else -1
        for label in y_text
    ]
    y = torch.tensor(y)

    # get split index
    if data_setting['freeze_class']:
        data['id_split'] = data_setting['id_split']
        matlab_data = sio.loadmat(data_setting['data_prefix'] +
                                  data_setting['sim_name_withS'])

        idx_tr = (matlab_data['train_ind'][0, data_setting['id_split']] -
                  1).tolist()[0]
        idx_te = (matlab_data['test_ind'][0, data_setting['id_split']] -
                  1).tolist()[0]
        idx_tr = torch.LongTensor(idx_tr)
        idx_te = torch.LongTensor(idx_te)

        data['sim'] = matlab_data['similarity'][0, data_setting['id_split']]
        data['sim'] = torch.from_numpy(data['sim'])
    else:
        idx_tr = torch.tensor([]).long()
        idx_te = torch.tensor([]).long()
        for c in unseen_class:
            idx_c = (y == class_dict[c]).nonzero().squeeze(-1)
            if data_setting['test_mode'] == 'standard':
                idx_te = torch.cat([idx_te, idx_c], dim=0)
            else:
                # idx_te = torch.cat([idx_te, idx_c], dim=0)
                n_unseen_in_test = int(idx_c.shape[0] *
                                       data_setting['sample_in_test_prob'])
                idx_te = torch.cat([idx_te, idx_c[:n_unseen_in_test]], dim=0)
        for c in seen_class:
            idx_c = (y == class_dict[c]).nonzero().squeeze(-1)
            if data_setting['test_mode'] == 'standard':
                idx_tr = torch.cat([idx_tr, idx_c], dim=0)
            elif data_setting['test_mode'] == 'general':
                idx_perm = torch.randperm(idx_c.shape[0])
                idx_c = idx_c[idx_perm]
                n_seen_in_test = int(idx_c.shape[0] *
                                     data_setting['sample_in_test_prob'])
                idx_tr = torch.cat([idx_tr, idx_c[n_seen_in_test:]])
                idx_te = torch.cat([idx_te, idx_c[:n_seen_in_test]])

    # shuffle data
    idx_tr = idx_tr[torch.randperm(idx_tr.shape[0])]
    idx_te = idx_te[torch.randperm(idx_te.shape[0])]

    # get padded class represent
    if data['text_represent'] == 'w2v':
        # class_pad, data['n_vocab'], data['d_emb'] = tokenize_w2v(seen_class + unseen_class, w2v)
        class_pad, data['n_vocab'], data['d_emb'] = tokenize_w2v(
            seen_class, w2v)
    else:
        class_pad, _, _ = tokenize_transformers(seen_class + unseen_class,
                                                data['key_pretrained'])

    # preprocess seen and unseen labels
    # if data_setting['text_represent'] == 'w2v':
    class_id_startpoint = 0
    sc_dict, sc_vec = process_label(seen_class, w2v, class_id_startpoint)
    if data_setting['test_mode'] == 'general':
        uc_dict, uc_vec = process_label(unseen_class, w2v,
                                        class_id_startpoint + len(sc_dict))
        uc_dict = dict(sc_dict, **uc_dict)
        uc_vec = np.concatenate([sc_vec, uc_vec], axis=0)
    else:
        uc_dict, uc_vec = process_label(unseen_class, w2v, class_id_startpoint)
    data['sc_dict'] = sc_dict
    data['sc_vec'] = sc_vec
    data['uc_dict'] = uc_dict
    data['uc_vec'] = uc_vec

    # finalize data package
    data['n_tr'] = idx_tr.shape[0]
    data['x_tr'] = x_pad[idx_tr]
    data['y_tr'] = y[idx_tr]
    data['y_ind'] = torch.zeros(data['n_tr'], len(seen_class)).scatter_(
        1, data['y_tr'].unsqueeze(1), 1)

    data['n_te'] = idx_te.shape[0]
    data['x_te'] = x_pad[idx_te]
    data['y_te'] = y[idx_te]

    data['seen_class'] = seen_class
    data['unseen_class'] = unseen_class
    data['class_padded'] = class_pad

    print("------------------read datasets end---------------------")
    return data
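A hypothetical data_setting for Example #4, assembled only from the keys the function reads above; every value here is an illustrative assumption, not the original configuration.

data_setting = {
    'dataset': 'SNIPS',
    'dataset_name': 'snips.csv',
    'data_prefix': 'data/',
    'wordvec_name': 'wiki.en.vec',
    'text_represent': 'w2v',            # or 'transformers'
    'key_pretrained': 'bert-base-uncased',
    'freeze_class': False,              # True also requires label_order, unseen_class,
                                        # id_split, and sim_name_withS
    'seen_class_prob': 0.7,
    'test_mode': 'standard',            # or 'general'
    'sample_in_test_prob': 0.3,
}
data = read_datasets(data_setting)
print(data['n_tr'], data['n_te'], len(data['seen_class']), len(data['unseen_class']))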