Example #1
    def test_gpt2_embedding(self):
        weight_path = 'test/data_for_tests/embedding/small_gpt2'
        vocab = Vocabulary().add_word_lst("this is a texta sentence".split())
        embed = GPT2Embedding(vocab,
                              model_dir_or_name=weight_path,
                              word_dropout=0.1)
        requires_grad = embed.requires_grad
        embed.requires_grad = not requires_grad
        embed.train()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        embed = GPT2Embedding(vocab,
                              model_dir_or_name=weight_path,
                              word_dropout=0.1,
                              only_use_pretrain_bpe=False,
                              language_model=True)
        embed.eval()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))
        embed.get_lm_loss()

        vocab.add_word("NotInGpt2")
        embed = GPT2Embedding(vocab,
                              model_dir_or_name=weight_path,
                              word_dropout=0.1,
                              only_use_pretrain_bpe=False,
                              auto_truncate=True,
                              min_freq=1)
        words = torch.LongTensor([[2, 3, 4, 0] * 20])
        result = embed(words)
        self.assertEqual(result.size(), (1, 80, 16))
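
A minimal standalone usage sketch (not part of the original test; it assumes fastNLP exports GPT2Embedding from fastNLP.embeddings and reuses the small_gpt2 test weights above):

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import GPT2Embedding  # import path is an assumption

vocab = Vocabulary().add_word_lst("this is a texta sentence".split())
embed = GPT2Embedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_gpt2')
words = torch.LongTensor([[vocab.to_index(w) for w in "this is a sentence".split()]])
print(embed(words).size())  # (1, 4, embed.embed_size), i.e. (1, 4, 16) for the small test weights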
Example #2
def get_dataset(data_path):
    print('Getting dataset...')

    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])

    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(
        lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
        new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    # exit()

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])

    return train_data, dev_data, vocabulary
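
A minimal calling sketch for get_dataset (the poem file path below is hypothetical, and a config module defining sequence_length is assumed, as in the function body):

train_data, dev_data, vocabulary = get_dataset('data/poetry.txt')  # hypothetical path
print('vocab size:', len(vocabulary))
print('first input ids:', train_data[0]['input'][:10])
print('first target ids:', train_data[0]['target'][:10])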
Example #3
    def test_load_with_vocab(self):
        vocab = Vocabulary()
        glove = "test/data_for_tests/glove.6B.50d_test.txt"
        word2vec = "test/data_for_tests/word2vec_test.txt"
        vocab.add_word('the')
        vocab.add_word('none')
        g_m = EmbedLoader.load_with_vocab(glove, vocab)
        self.assertEqual(g_m.shape, (4, 50))
        w_m = EmbedLoader.load_with_vocab(word2vec, vocab, normalize=True)
        self.assertEqual(w_m.shape, (4, 50))
        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 4)
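
A short sketch of using the loaded matrix (the fastNLP import paths are my assumption; rows of the returned array are aligned with the vocabulary indices):

import torch
import torch.nn as nn
from fastNLP import Vocabulary
from fastNLP.io import EmbedLoader

vocab = Vocabulary()
vocab.add_word_lst(['the', 'none'])
matrix = EmbedLoader.load_with_vocab("test/data_for_tests/glove.6B.50d_test.txt", vocab)
# the matrix is vocab-aligned, so it can seed a torch embedding layer
emb = nn.Embedding.from_pretrained(torch.FloatTensor(matrix))
print(emb(torch.LongTensor([vocab.to_index('the')])).size())  # (1, 50)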
Example #4
    def test_case12(self):
        # Test that the allowed transition matrix can be generated from a vocab
        from fastNLP.modules.decoder.crf import allowed_transitions

        id2label = {0: 'B', 1: 'I', 2: 'O'}
        vocab = Vocabulary(unknown=None, padding=None)
        for idx, tag in id2label.items():
            vocab.add_word(tag)
        expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2),
                        (2, 4), (3, 0), (3, 2)}
        self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True)))

        id2label = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
        vocab = Vocabulary(unknown=None, padding=None)
        for idx, tag in id2label.items():
            vocab.add_word(tag)
        expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)}
        self.assertSetEqual(expected_res, set(
            allowed_transitions(vocab, include_start_end=True)))

        id2label = {0: 'B', 1: 'I', 2: 'O', 3: '<pad>', 4: "<unk>"}
        vocab = Vocabulary()
        for idx, tag in id2label.items():
            vocab.add_word(tag)
        allowed_transitions(vocab, include_start_end=True)

        labels = ['O']
        for label in ['X', 'Y']:
            for tag in 'BI':
                labels.append('{}-{}'.format(tag, label))
        id2label = {idx: label for idx, label in enumerate(labels)}
        expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1),
                        (2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3),
                        (4, 4), (4, 6), (5, 0), (5, 1), (5, 3)}
        vocab = Vocabulary(unknown=None, padding=None)
        for idx, tag in id2label.items():
            vocab.add_word(tag)
        self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True)))

        labels = []
        for label in ['X', 'Y']:
            for tag in 'BMES':
                labels.append('{}-{}'.format(tag, label))
        id2label = {idx: label for idx, label in enumerate(labels)}
        vocab = Vocabulary(unknown=None, padding=None)
        for idx, tag in id2label.items():
            vocab.add_word(tag)
        expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4),
                        (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0),
                        (7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)}
        self.assertSetEqual(expected_res, set(
            allowed_transitions(vocab, include_start_end=True)))
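
A short sketch of why tag ids beyond the vocab size appear in the expected sets above: with include_start_end=True, allowed_transitions also uses index len(vocab) for the start symbol and len(vocab)+1 for the end symbol (inferred from the expected sets in the test above):

from fastNLP import Vocabulary
from fastNLP.modules.decoder.crf import allowed_transitions

vocab = Vocabulary(unknown=None, padding=None)
vocab.add_word_lst(['B', 'I', 'O'])  # tag ids 0, 1, 2; start becomes 3, end becomes 4
for from_idx, to_idx in sorted(allowed_transitions(vocab, include_start_end=True)):
    print(from_idx, '->', to_idx)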
Example #5
def load_snli(path, files):
    loader = SNLILoader()
    ds_list = [loader.load(os.path.join(path, f)) for f in files]
    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None, padding=None)
    for ds in ds_list:
        ds.apply(lambda x: [w.lower() for w in x['words1']],
                 new_field_name='words1')
        ds.apply(lambda x: [w.lower() for w in x['words2']],
                 new_field_name='words2')
    update_v(word_v, ds_list[0], 'words1')
    update_v(word_v, ds_list[0], 'words2')
    ds_list[0].apply(lambda x: tag_v.add_word(x['target']),
                     new_field_name=None)

    def process_data(ds):
        to_index(word_v, ds, 'words1', C.INPUTS(0))
        to_index(word_v, ds, 'words2', C.INPUTS(1))
        ds.apply(lambda x: tag_v.to_index(x['target']),
                 new_field_name=C.TARGET)
        ds.apply(lambda x: x[C.INPUTS(0)][:MAX_LEN],
                 new_field_name=C.INPUTS(0))
        ds.apply(lambda x: x[C.INPUTS(1)][:MAX_LEN],
                 new_field_name=C.INPUTS(1))
        ds.apply(lambda x: len(x[C.INPUTS(0)]), new_field_name=C.INPUT_LENS(0))
        ds.apply(lambda x: len(x[C.INPUTS(1)]), new_field_name=C.INPUT_LENS(1))
        ds.set_input(C.INPUTS(0), C.INPUTS(1), C.INPUT_LENS(0),
                     C.INPUT_LENS(1))
        ds.set_target(C.TARGET)

    for i in range(len(ds_list)):
        process_data(ds_list[i])
    return ds_list, word_v, tag_v
Example #6
def load_sst(path, files):
    loaders = [
        SSTLoader(subtree=sub, fine_grained=True)
        for sub in [True, False, False]
    ]
    ds_list = [
        loader.load(os.path.join(path, fn))
        for fn, loader in zip(files, loaders)
    ]
    word_v = Vocabulary(min_freq=2)
    tag_v = Vocabulary(unknown=None, padding=None)
    for ds in ds_list:
        ds.apply(lambda x: [w.lower() for w in x['words']],
                 new_field_name='words')
    ds_list[0].drop(lambda x: len(x['words']) < 3)
    update_v(word_v, ds_list[0], 'words')
    ds_list[0].apply(lambda x: tag_v.add_word(x['target']),
                     new_field_name=None)

    def process_data(ds):
        to_index(word_v, ds, 'words', C.INPUT)
        ds.apply(lambda x: tag_v.to_index(x['target']),
                 new_field_name=C.TARGET)
        ds.apply(lambda x: x[C.INPUT][:MAX_LEN], new_field_name=C.INPUT)
        ds.apply(lambda x: len(x['words']), new_field_name=C.INPUT_LEN)
        ds.set_input(C.INPUT, C.INPUT_LEN)
        ds.set_target(C.TARGET)

    for i in range(len(ds_list)):
        process_data(ds_list[i])
    return ds_list, word_v, tag_v
Example #7
    def test_no_entry(self):
        # Build the vocabulary first, then vary no_create_entry and check that it is tracked correctly
        text = [
            "FastNLP", "works", "well", "in", "most", "cases", "and", "scales",
            "well", "in", "works", "well", "in", "most", "cases", "scales",
            "well"
        ]
        vocab = Vocabulary()
        vocab.add_word_lst(text)

        self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))
        vocab.add_word('FastNLP', no_create_entry=True)
        self.assertFalse(vocab._is_word_no_create_entry('FastNLP'))

        vocab.add_word('fastnlp', no_create_entry=True)
        self.assertTrue(vocab._is_word_no_create_entry('fastnlp'))
        vocab.add_word('fastnlp', no_create_entry=False)
        self.assertFalse(vocab._is_word_no_create_entry('fastnlp'))

        vocab.add_word_lst(['1'] * 10, no_create_entry=True)
        self.assertTrue(vocab._is_word_no_create_entry('1'))
        vocab.add_word('1')
        self.assertFalse(vocab._is_word_no_create_entry('1'))
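
A short sketch of the typical use of no_create_entry; the reading that it marks words seen only in dev/test, so that embeddings need not create trainable entries for them, is my gloss on the API:

from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst(['seen', 'in', 'train'])                      # training words
vocab.add_word_lst(['only', 'in', 'dev'], no_create_entry=True)  # dev/test-only words
print(vocab._is_word_no_create_entry('only'))  # True: never added as a training word
print(vocab._is_word_no_create_entry('in'))    # False: already added as a training word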
Example #8
    def process(self, paths, config, load_vocab_file=True):
        """
        :param paths: dict  path for each dataset
        :param load_vocab_file: bool  build vocab (False) or load vocab (True)
        :return: DataBundle
            datasets: dict  keys correspond to the paths dict
            vocabs: dict  key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True)
            embeddings: optional
        """

        vocab_size = config.vocab_size

        def _merge_abstracts(abstracts):
            merged = []
            for abstract in abstracts:
                merged.extend(abstract[:self.max_concat_len] + [SEP])
            if len(abstracts) == 0:
                assert merged == []
            return merged[:-1]

        def _pad_graph_inputs(graph_inputs):
            pad_text_wd = []
            max_len = config.max_graph_enc_steps

            for graph_input in graph_inputs:
                if len(graph_input) < max_len:
                    pad_num = max_len - len(graph_input)
                    graph_input.extend([PAD_TOKEN] * pad_num)
                else:
                    graph_input = graph_input[:max_len]
                pad_text_wd.append(graph_input)

            if len(pad_text_wd) == 0:
                pad_text_wd.append([PAD_TOKEN] * max_len)

            return pad_text_wd

        def _get_nbr_input_len(input_wd):
            enc_len = [
                min(len(text), config.max_graph_enc_steps) for text in input_wd
            ]
            if len(enc_len) == 0:
                enc_len = [0]
            return enc_len

        def _pad_article(text_wd):
            token_num = len(text_wd)
            max_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_len += self.max_concat_len * self.max_concat_num
            if token_num < max_len:
                padding = [PAD_TOKEN] * (max_len - token_num)
                article = text_wd + padding
            else:
                article = text_wd[:max_len]
            return article

        def _split_list(input_list):
            return [text.split() for text in input_list]

        def sent_tokenize(abstract):
            abs_list = abstract.split(".")
            return [(abst + ".") for abst in abs_list[:-1]]

        def _article_token_mask(text_wd):
            max_enc_len = config.max_enc_steps
            if config.neighbor_process == "sep":
                max_enc_len += self.max_concat_len * self.max_concat_num
            token_num = len(text_wd)
            if token_num < max_enc_len:
                mask = [1] * token_num + [0] * (max_enc_len - token_num)
            else:
                mask = [1] * max_enc_len
            return mask

        def generate_article_input(text, abstracts):
            if config.neighbor_process == "sep":
                text_wd = text.split()[:config.max_enc_steps]
                text_wd.append(SEP)
                abstracts_wd = _merge_abstracts(abstracts)
                return text_wd + abstracts_wd
            else:
                return text.split()

        def generate_graph_inputs(graph_struct):

            graph_inputs_ = [
                graph_strut_dict[pid][config.graph_input_type]
                for pid in graph_struct
            ]
            return _split_list(graph_inputs_[1:])

        def generate_graph_structs(paper_id):
            sub_graph_dict = {}
            sub_graph_set = []

            n_hop = config.n_hop
            max_neighbor_num = config.max_neighbor_num
            k_nbrs = _k_hop_neighbor(paper_id, n_hop, max_neighbor_num)
            for sub_g in k_nbrs:
                sub_graph_set += sub_g

            for node in sub_graph_set:
                sub_graph_dict[node] = []

            for sub_g in k_nbrs:
                for centre_node in sub_g:
                    nbrs = graph_strut_dict[centre_node]['references']
                    c_nbrs = list(set(nbrs).intersection(sub_graph_set))
                    sub_graph_dict[centre_node].extend(c_nbrs)
                    for c_nbr in c_nbrs:
                        sub_graph_dict[c_nbr].append(centre_node)
            # dicts preserve insertion order (Python 3.6+), so the first key in sub_graph_dict is the source paper
            return sub_graph_dict

        def _k_hop_neighbor(paper_id, n_hop, max_neighbor):
            sub_graph = [[] for _ in range(n_hop + 1)]
            level = 0
            visited = set()
            q = deque()
            q.append([paper_id, level])
            curr_node_num = 0
            while len(q) != 0:
                paper_first = q.popleft()
                paper_id_first, level_first = paper_first
                if level_first > n_hop:
                    return sub_graph
                sub_graph[level_first].append(paper_id_first)
                curr_node_num += 1
                if curr_node_num > max_neighbor:
                    return sub_graph
                visited.add(paper_id_first)
                for pid in graph_strut_dict[paper_id_first]["references"]:
                    if pid not in visited and pid in graph_strut_dict:
                        q.append([pid, level_first + 1])
                        visited.add(pid)

            return sub_graph

        def generate_dgl_graph(paper_id, graph_struct, nodes_num):
            g = dgl.DGLGraph()
            assert len(graph_struct) == nodes_num

            g.add_nodes(len(graph_struct))
            pid2idx = {}
            for index, key_node in enumerate(graph_struct):
                pid2idx[key_node] = index
            assert pid2idx[paper_id] == 0

            for index, key_node in enumerate(graph_struct):
                neighbor = [pid2idx[node] for node in graph_struct[key_node]]
                # add self loop
                neighbor.append(index)
                key_nodes = [index] * len(neighbor)
                g.add_edges(key_nodes, neighbor)
            return g

        train_ds = None
        dataInfo = self.load(paths)

        # pop nodes in train graph in inductive setting
        if config.mode == "test" and self.setting == "inductive":
            dataInfo.datasets.pop("train")

        graph_strut_dict = {}
        for key, ds in dataInfo.datasets.items():
            for ins in ds:
                graph_strut_dict[ins["paper_id"]] = ins

        logger.info(f"the input graph G_v has {len(graph_strut_dict)} nodes")

        for key, ds in dataInfo.datasets.items():
            # process summary
            ds.apply(lambda x: x['abstract'].split(),
                     new_field_name='summary_wd')
            ds.apply(lambda x: sent_tokenize(x['abstract']),
                     new_field_name='abstract_sentences')
            # generate graph

            ds.apply(lambda x: generate_graph_structs(x["paper_id"]),
                     new_field_name="graph_struct")
            ds.apply(lambda x: generate_graph_inputs(x["graph_struct"]),
                     new_field_name='graph_inputs_wd')

            ds.apply(lambda x: len(x["graph_inputs_wd"]) + 1,
                     new_field_name="nodes_num")
            # pad input
            ds.apply(lambda x: generate_article_input(x['introduction'],
                                                      x["graph_inputs_wd"]),
                     new_field_name='input_wd')
            ds.apply(lambda x: _article_token_mask(x["input_wd"]),
                     new_field_name="enc_len_mask")
            ds.apply(lambda x: sum(x["enc_len_mask"]),
                     new_field_name="enc_len")
            ds.apply(lambda x: _pad_article(x["input_wd"]),
                     new_field_name="pad_input_wd")

            ds.apply(lambda x: _get_nbr_input_len(x["graph_inputs_wd"]),
                     new_field_name="nbr_inputs_len")

            ds.apply(lambda x: _pad_graph_inputs(x["graph_inputs_wd"]),
                     new_field_name="pad_graph_inputs_wd")
            if key == "train":
                train_ds = ds

        vocab_dict = {}
        if not load_vocab_file:
            logger.info("[INFO] Build new vocab from training dataset!")
            if train_ds is None:
                raise ValueError("Lack train file to build vocabulary!")

            vocabs = Vocabulary(max_size=config.vocab_size - 2,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.from_dataset(train_ds,
                                field_name=["input_wd", "summary_wd"])
            vocabs.add_word(START_DECODING)
            vocabs.add_word(STOP_DECODING)
            vocab_dict["vocab"] = vocabs
            # save vocab
            with open(os.path.join(config.train_path, "vocab"),
                      "w",
                      encoding="utf8") as f:
                for w, idx in vocabs:
                    f.write(str(w) + "\t" + str(idx) + "\n")
            logger.info(
                "building the new vocab is done; please rerun the code with load_vocab_file=True"
            )
            exit(0)
        else:

            logger.info("[INFO] Load existing vocab from %s!" %
                        config.vocab_path)
            word_list = []
            cnt = 3  # pad and unk
            if config.neighbor_process == "sep":
                cnt += 1

            with open(config.vocab_path, 'r', encoding='utf8') as vocab_f:
                for line in vocab_f:
                    pieces = line.split("\t")
                    word_list.append(pieces[0])
                    cnt += 1
                    if cnt > vocab_size:
                        break

            vocabs = Vocabulary(max_size=vocab_size,
                                padding=PAD_TOKEN,
                                unknown=UNKNOWN_TOKEN)
            vocabs.add_word_lst(word_list)
            vocabs.add(START_DECODING)
            vocabs.add(STOP_DECODING)
            if config.neighbor_process == "sep":
                vocabs.add(SEP)
            vocabs.build_vocab()
            vocab_dict["vocab"] = vocabs

        logger.info(f"vocab size = {len(vocabs)}")
        assert len(vocabs) == config.vocab_size
        dataInfo.set_vocab(vocabs, "vocab")

        for key, dataset in dataInfo.datasets.items():
            # do not process the training set in test mode
            if config.mode == "test" and key == "train":
                continue

            data_dict = {
                "enc_input": [],
                "nbr_inputs": [],
                "graph": [],
                "dec_input": [],
                "target": [],
                "dec_len": [],
                "article_oovs": [],
                "enc_input_extend_vocab": [],
            }
            logger.info(
                f"start construct the input of the model for {key} set, please wait..."
            )
            for instance in dataset:
                graph_inputs = instance["pad_graph_inputs_wd"]
                abstract_sentences = instance["summary_wd"]
                enc_input = instance["pad_input_wd"]
                enc_input, nbr_inputs, dec_input, target, dec_len, article_oovs, enc_input_extend_vocab = \
                    getting_full_info(enc_input, graph_inputs, abstract_sentences, dataInfo.vocabs['vocab'], config)
                graph = generate_dgl_graph(instance["paper_id"],
                                           instance["graph_struct"],
                                           instance["nodes_num"])
                data_dict["graph"].append(graph)
                data_dict["enc_input"].append(enc_input)
                data_dict["nbr_inputs"].append(nbr_inputs)
                data_dict["dec_input"].append(dec_input)
                data_dict["target"].append(target)
                data_dict["dec_len"].append(dec_len)
                data_dict["article_oovs"].append(article_oovs)
                data_dict["enc_input_extend_vocab"].append(
                    enc_input_extend_vocab)

            dataset.add_field("enc_input", data_dict["enc_input"])
            dataset.add_field("nbr_inputs", data_dict["nbr_inputs"])
            dataset.add_field("dec_input", data_dict["dec_input"])
            dataset.add_field("target", data_dict["target"])
            dataset.add_field("dec_len", data_dict["dec_len"])
            dataset.add_field("article_oovs", data_dict["article_oovs"])
            dataset.add_field("enc_input_extend_vocab",
                              data_dict["enc_input_extend_vocab"])

            dataset.add_field("graph", data_dict["graph"])
            dataset.set_ignore_type(
                'graph')  # graph objects are not tensors, so skip type inference for this field
            dataset.set_input("graph")

            dataset.set_input("nbr_inputs_len", "nbr_inputs", "enc_len",
                              "enc_input", "enc_len_mask", "dec_input",
                              "dec_len", "article_oovs", "nodes_num",
                              "enc_input_extend_vocab")
            dataset.set_target("target", "article_oovs", "abstract_sentences")

            dataset.delete_field('graph_inputs_wd')
            dataset.delete_field('pad_graph_inputs_wd')
            dataset.delete_field('input_wd')
            dataset.delete_field('pad_input_wd')
        logger.info("------load dataset over---------")
        return dataInfo, vocabs
Example #9
    def test_add_word(self):
        vocab = Vocabulary()
        for word in text:
            vocab.add_word(word)
        self.assertEqual(vocab.word_count, counter)
Example #10
    def test_only_use_pretrain_word(self):
        def check_word_unk(words, vocab, embed):
            for word in words:
                self.assertListEqual(
                    embed(torch.LongTensor([vocab.to_index(word)
                                            ])).tolist()[0],
                    embed(torch.LongTensor([1])).tolist()[0])

        def check_vector_equal(words, vocab, embed, embed_dict, lower=False):
            for word in words:
                index = vocab.to_index(word)
                v1 = embed(torch.LongTensor([index])).tolist()[0]
                if lower:
                    word = word.lower()
                v2 = embed_dict[word]
                for v1i, v2i in zip(v1, v2):
                    self.assertAlmostEqual(v1i, v2i, places=4)

        embed_dict = read_static_embed(
            'test/data_for_tests/embedding/small_static_embedding/'
            'glove.6B.50d_test.txt')

        # Test that only pretrained words are used
        vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
        vocab.add_word('of', no_create_entry=True)
        embed = StaticEmbedding(
            vocab,
            model_dir_or_name=
            'test/data_for_tests/embedding/small_static_embedding/'
            'glove.6B.50d_test.txt',
            only_use_pretrain_word=True)
        # 'notinfile' should be mapped to unk
        check_vector_equal(['the', 'a', 'of'], vocab, embed, embed_dict)
        check_word_unk(['notinfile'], vocab, embed)

        # Test behaviour with mixed-case words
        vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile'])
        vocab.add_word('Of', no_create_entry=True)
        embed = StaticEmbedding(
            vocab,
            model_dir_or_name=
            'test/data_for_tests/embedding/small_static_embedding/'
            'glove.6B.50d_test.txt',
            only_use_pretrain_word=True)
        check_word_unk(['The', 'Of', 'notinfile'], vocab, embed)  # these words should not be found (fall back to unk)
        check_vector_equal(['a'], vocab, embed, embed_dict)

        embed = StaticEmbedding(
            vocab,
            model_dir_or_name=
            'test/data_for_tests/embedding/small_static_embedding/'
            'glove.6B.50d_test.txt',
            only_use_pretrain_word=True,
            lower=True)
        check_vector_equal(['The', 'Of', 'a'],
                           vocab,
                           embed,
                           embed_dict,
                           lower=True)
        check_word_unk(['notinfile'], vocab, embed)

        # Test min_freq
        vocab = Vocabulary().add_word_lst(
            ['The', 'a', 'notinfile1', 'A', 'notinfile2', 'notinfile2'])
        vocab.add_word('Of', no_create_entry=True)

        embed = StaticEmbedding(
            vocab,
            model_dir_or_name=
            'test/data_for_tests/embedding/small_static_embedding/'
            'glove.6B.50d_test.txt',
            only_use_pretrain_word=True,
            lower=True,
            min_freq=2,
            only_train_min_freq=True)

        check_vector_equal(['Of', 'a'], vocab, embed, embed_dict, lower=True)
        check_word_unk(['notinfile1', 'The', 'notinfile2'], vocab, embed)
Example #11
class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"

    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None


    def __init__(self, data_src="20news", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self, size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size == 20:
            train, test = get_all_20news()
        else:
            train, test = get_text_classification_datasets()
        train_input, test_input = tokenize(train.data, test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text":train_input,"class":train_target})
        self.test_set = DataSet({"text":test_input,"class":test_target})
        
        # Building Fastnlp vocabulary...
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        # Building multi-hot-vector for train_set and test_set.
        print("Building id-presentation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set, self.test_set, field_name='text', new_field_name='words')

        self.train_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')

        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)

        self.train_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        self.test_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input")
        # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')
        
        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        self.test_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        # self.train_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")
        # self.test_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")

    def fetch_csv(self, path=None):
        print("Not implemented now...")
        pass

    def fetch_data(self, path=None):
        if self.data_src == "20news":
            # Loading 20newsgroups data and tokenize.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.test_size
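
A minimal usage sketch for this class (it assumes the 20news helper functions used above are importable in the same module):

text_data = TextData(data_src="20news", min_count=10, seq_limit=1000)
train_size, test_size = text_data.fetch_data()
print(train_size, test_size, text_data.vocab_size, text_data.max_seq_len)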
Example #12
class TextData():
    data_src = "all_data"
    class_num = 2
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000

    train_set = DataSet()
    val_set = DataSet()
    test_set = DataSet()
    train_size = 0
    val_size = 0
    test_size = 0

    test_projectid = None

    vocab = None
    vocab_size = 0

    def __init__(self, data_src="all_data", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_csv(self,
                  path,
                  text_var="essay",
                  target="is_exciting",
                  subset_num=None,
                  us_rate=None,
                  os_rate=None):
        """ 
        us_rate: under sampling rate
        os_rate: over sampling rate
         """
        print("Loading data from {} ...".format(path))
        df = pd.read_csv(path)
        # text_vars=["title", "short_description", "need_statement", "essay"]
        text_vars = text_var  # only select the essay column
        target_var = "y"
        df[target_var] = 0.0
        df.loc[df[target] == "t", target_var] = 1.0
        df.loc[df[target] != "t", target_var] = 0.0
        train_df = df[df['split'] == 'train']
        val_df = df[df['split'] == 'val']
        test_df = df[df['split'] == 'test']
        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("nums:({},{},{})".format(train_num, val_num, test_num))
        if os_rate is not None:
            print("Over Sample mode")
            ros = RandomOverSampler(random_state=0)
        elif us_rate is not None:
            print("Under Sample mode")
            train_df_t = train_df[df[target] == "t"]
            train_df_f = train_df[df[target] == "f"]
            t_num = len(train_df_t)
            f_num = len(train_df_f)
            print("Raw train t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_train_t = train_df_t.sample(n=t_num)
            balanced_train_f = train_df_f.sample(n=f_num)
            train_df = pd.concat([balanced_train_t,
                                  balanced_train_f]).sample(frac=1)
            print("Balanced train: t:f = {}:{}".format(len(balanced_train_t),
                                                       len(balanced_train_f)))
            # print("Train 1.0:",len(train_df[train_df[target_var] == 1.0]))

            val_df_t = val_df[df[target] == "t"]
            val_df_f = val_df[df[target] == "f"]
            t_num = len(val_df_t)
            f_num = len(val_df_f)
            print("Raw val t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_val_t = val_df_t.sample(n=t_num)
            balanced_val_f = val_df_f.sample(n=f_num)
            val_df = pd.concat([balanced_val_t, balanced_val_f]).sample(frac=1)
            print("Balanced val: t:f = {}:{}".format(len(balanced_val_t),
                                                     len(balanced_val_f)))
        else:
            print("No sample mode")
        if subset_num is not None and subset_num > 0:
            print("Get sub set of size {}.".format(subset_num))
            train_df = train_df.sample(n=subset_num)
            val_df = val_df.sample(n=subset_num)

        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("subset nums:({},{},{})".format(train_num, val_num, test_num))

        train_target = train_df[target_var].values
        count = 0
        print(count)
        val_target = val_df[target_var].values
        test_target = test_df[target_var].values

        print("tokenize train set")
        train_input = tokenize(train_df[text_vars].values)
        print("tokenize val set")
        val_input = tokenize(val_df[text_vars].values)
        print("tokenize test set")
        test_input = tokenize(test_df[text_vars].values)

        assert (self.class_num == 2)
        self.test_projectid = test_df['projectid']
        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        if os_rate is not None:
            print("Over Sampling...")
            train_input, train_target = ros.fit_sample(
                np.array(train_input)[:, np.newaxis],
                np.array(train_target)[:, np.newaxis])
            train_input = train_input.squeeze().tolist()
            train_target = train_target.tolist()
            val_input, val_target = ros.fit_sample(
                np.array(val_input)[:, np.newaxis],
                np.array(val_target)[:, np.newaxis])
            val_input = val_input.squeeze().tolist()
            val_target = val_target.tolist()
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.val_set = DataSet({"text": val_input, "class": val_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary...
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(
            lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        # Building multi-hot-vector for train_set and test_set.
        print("Building id-presentation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set,
                                 self.val_set,
                                 self.test_set,
                                 field_name='text',
                                 new_field_name='words')

        self.train_set.apply_field(lambda x: len(x),
                                   field_name='words',
                                   new_field_name='seq_len')
        self.val_set.apply_field(lambda x: len(x),
                                 field_name='words',
                                 new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x),
                                  field_name='words',
                                  new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')

        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)

        self.train_set.apply_field(self.seq_regularize,
                                   field_name='words',
                                   new_field_name='words')
        self.val_set.apply_field(self.seq_regularize,
                                 field_name='words',
                                 new_field_name='words')
        self.test_set.apply_field(self.seq_regularize,
                                  field_name='words',
                                  new_field_name='words')
        # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input")
        # self.val_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')
        # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']),
                             new_field_name="target",
                             is_target=True)
        self.val_set.apply(lambda x: int(x['class']),
                           new_field_name="target",
                           is_target=True)
        self.test_set.apply(lambda x: int(x['class']),
                            new_field_name="target",
                            is_target=True)
        # self.train_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")
        # self.test_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")

    def fetch_data(self,
                   path,
                   text_var="essay",
                   target_var="is_exciting",
                   subset_num=None,
                   us_rate=None,
                   os_rate=None):
        if self.data_src == "all_data":
            # Load the CSV data and tokenize it.
            self.fetch_csv(path, text_var, target_var, subset_num, us_rate,
                           os_rate)
        else:
            print("No legal data src type:{} ...".format(self.data_src))
            assert (0 == 1)

        self.train_size = self.train_set.get_length()
        self.val_size = self.val_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.val_size, self.test_size