Example #1
 def load(self, path: str, bigram: bool = False) -> DataSet:
     """
     :param path: path to the data file
     :param bigram: whether to also generate bigram features
     :return: DataSet
     """
     dataset = DataSet()
     with open(path, 'r', encoding='utf-8') as f:
         for line in f:
             line = line.strip()
             if not line:  # skip empty lines
                 continue
             parts = line.split()
             word_lens = map(len, parts)
             chars = list(''.join(parts))
             tags = self._word_len_to_target(word_lens)
             assert len(chars) == len(tags['target'])
             dataset.append(
                 Instance(raw_chars=chars, **tags, seq_len=len(chars)))
     if len(dataset) == 0:
         raise RuntimeError(f"{path} has no valid data.")
     if bigram:
         dataset.apply_field(self._gen_bigram,
                             field_name='raw_chars',
                             new_field_name='bigrams')
     return dataset
Example #2
def make_dataset(data):
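    # Build a fastNLP DataSet from (text, label) pairs: tokenize each sentence into
    # a 'words' field, record its length, and mark the input/target fields.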
    dataset = DataSet()
    mx = 0     # length of the longest tokenized sentence seen so far
    le = None  # the longest sentence itself
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)  # report the length of the longest sentence
    dataset.apply_field(lambda x: x.split(),
                        field_name='sentence',
                        new_field_name='words')
    dataset.apply_field(lambda x: len(x),
                        field_name='words',
                        new_field_name='seq_len')

    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)

    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
Example #3
    def test_apply_tqdm(self):
        import time
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

        def do_nothing(ins):
            time.sleep(0.01)

        ds.apply(do_nothing, use_tqdm=True)
        ds.apply_field(do_nothing, field_name='x', use_tqdm=True)
Example #4
    def test_copy_padder(self):
        from fastNLP.core.field import AutoPadder
        ds = DataSet()
        ds.add_field('idx', [1, 2, 3])
        ds['idx'].set_padder(None)  # workaround of problem 1
        ds.apply_field(lambda x: x, 'idx', 'idx')
        self.assertEqual(ds['idx'].padder,
                         None)  # should be None, but AutoPadder

        ds = DataSet()
        ds.add_field('idx', [1, 2, 3])
        ds.apply_field(lambda x: x, 'idx', 'idx')
        self.assertTrue(
            isinstance(ds.get_field('idx').padder,
                       AutoPadder))  # should be None, but AutoPadder
Example #5
 def prepare_nli_data(self):
     index = 'index'
     ds = DataSet({index: list(range(N_SAMPLES))})
     ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
                    field_name=index, new_field_name=C.INPUTS(0),
                    is_input=True)
     ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
                    field_name=index, new_field_name=C.INPUTS(1),
                    is_input=True)
     ds.apply_field(lambda x: randrange(NUM_CLS),
                    field_name=index, new_field_name=C.TARGET,
                    is_target=True)
     ds.apply_field(len, C.INPUTS(0), C.INPUT_LENS(0),
                    is_input=True, is_target=True)
     ds.apply_field(len, C.INPUTS(1), C.INPUT_LENS(1),
                    is_input=True, is_target=True)
     ds.set_input(C.INPUTS(0), C.INPUTS(1))
     ds.set_target(C.TARGET)
     return ds
Example #6
def construct_dataset(dataset):
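    # Build a fastNLP DataSet of (raw_sentence, target) pairs, then derive a cleaned,
    # lower-cased 'sentence' field and a tokenized 'input' field from it.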
    dataset_ = DataSet()
    for sentence, target in zip(dataset.data, dataset.target):
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = int(target)
        dataset_.append(instance)

    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '',
                                    x['raw_sentence']),
                   new_field_name='sentence')  # strip punctuation
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ',
                                    x['sentence']),
                   new_field_name='sentence')  # replace whitespace (spaces, newlines, etc.) with a single space
    dataset_.apply(lambda x: x['sentence'].lower(),
                   new_field_name='sentence')  # convert to lowercase
    dataset_.apply_field(lambda x: x.split(),
                         field_name='sentence',
                         new_field_name='input')
    return dataset_
Example #7
def read_file(filename, processing_word=get_processing_word(lowercase=False)):
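    # Parse a CoNLL-style file (one "word tag" pair per line; blank lines and
    # -DOCSTART- lines separate sentences) into a fastNLP DataSet.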
    dataset = DataSet()
    niter = 0
    with codecs.open(filename, "r", "utf-16") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    assert len(words) > 2
                    if niter == 1:
                        print(words, tags)
                    niter += 1
                    dataset.append(
                        Instance(ori_words=words[:-1], ori_tags=tags[:-1]))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                words.append(word)
                tags.append(tag.lower())

    dataset.apply_field(lambda x: [x[0]],
                        field_name='ori_words',
                        new_field_name='task')
    dataset.apply_field(lambda x: len(x),
                        field_name='ori_tags',
                        new_field_name='seq_len')
    dataset.apply_field(lambda x: expand(x),
                        field_name='ori_words',
                        new_field_name="bi1")
    return dataset
Example #8
 def prepare_pos_tagging_data(self):
     index = 'index'
     ds = DataSet({index: list(range(N_SAMPLES))})
     ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
                    field_name=index, new_field_name=C.INPUT,
                    is_input=True)
     ds.apply_field(lambda x: self.gen_seq(len(x), NUM_CLS),
                    field_name=C.INPUT, new_field_name=C.TARGET,
                    is_target=True)
     ds.apply_field(len, C.INPUT, C.INPUT_LEN,
                    is_input=True, is_target=True)
     return ds
Example #9
def get_data():
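    # Load the train/test text-classification splits, clean and tokenize the text,
    # build the vocabulary from the training split only, and index all three splits.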
    dataset_train, dataset_test = get_text_classification_datasets()
    # print(dataset_train.data)

    dic_train = {
        "input" : dataset_train.data,
        "target" : dataset_train.target
    }
    dic_test = {
        "input" : dataset_test.data,
        "target" : dataset_test.target
    }

    dataset = DataSet(dic_train)
    test_data = DataSet(dic_test)

    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    dataset.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')

    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.punctuation), "", x.lower()), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: re.sub(r'[{}]+'.format(string.whitespace), " ", x), field_name='input', new_field_name='input')
    test_data.apply_field(lambda x: x.split(), field_name='input', new_field_name='words')


    # **************************
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    test_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('target', Const.TARGET)
    
    test_data.rename_field('words', Const.INPUT)
    test_data.rename_field('seq_len', Const.INPUT_LEN)
    test_data.rename_field('target', Const.TARGET)

    # dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_input(Const.INPUT)
    dataset.set_target(Const.TARGET)

    # test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_input(Const.INPUT)
    test_data.set_target(Const.TARGET)
    # **************************

    # build the vocabulary from the training split only (alternatively, from train+dev)
    train_data, dev_data = dataset.split(0.1)
    # print(len(train_data), len(dev_data), len(test_data))
    # print(train_data[0])

    vocab = Vocabulary(min_freq=10).from_dataset(train_data, field_name=Const.INPUT)

    vocab.index_dataset(train_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(dev_data, field_name=Const.INPUT, new_field_name=Const.INPUT)
    vocab.index_dataset(test_data, field_name=Const.INPUT, new_field_name=Const.INPUT)

    # print(test_data[0])
    print(len(vocab))
    return vocab, train_data, dev_data, test_data
Example #10
# Prepare the dataset and testset
fitlog.commit(__file__)
fitlog.add_hyper_in_file(__file__)

table = str.maketrans('', '', string.punctuation)
newsgroups_train = fetch_20newsgroups(subset='train')
dataset = DataSet()
for i in range(newsgroups_train.target.shape[0]):
    dataset.append(
        Instance(raw_sentence=newsgroups_train.data[i].replace('\n', ' '),
                 target=int(newsgroups_train.target[i])))
dataset.apply(lambda x: x['raw_sentence'].lower().translate(table),
              new_field_name='sentence')
dataset.apply_field(lambda x: x.split(),
                    field_name='sentence',
                    new_field_name='words')
dataset.apply_field(lambda x: len(x),
                    field_name='words',
                    new_field_name='seq_len')

newsgroups_test = fetch_20newsgroups(subset='test')
testset = DataSet()
for i in range(newsgroups_test.target.shape[0]):
    testset.append(
        Instance(raw_sentence=newsgroups_test.data[i].replace('\n', ' '),
                 target=int(newsgroups_test.target[i])))
testset.apply(lambda x: x['raw_sentence'].lower().translate(table),
              new_field_name='sentence')
testset.apply_field(lambda x: x.split(),
                    field_name='sentence',
                    new_field_name='words')
Example #11
    def test_tutorial_1_data_preprocess(self):
        from fastNLP import DataSet
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
                'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'],
                          ['Third', 'instance', '.']],
                'seq_len': [6, 3, 3]}
        dataset = DataSet(data)
        # every value in the dict passed in must be a list of the same length

        from fastNLP import DataSet
        from fastNLP import Instance
        dataset = DataSet()
        instance = Instance(raw_words="This is the first instance",
                            words=['this', 'is', 'the', 'first', 'instance', '.'],
                            seq_len=6)
        dataset.append(instance)

        from fastNLP import DataSet
        from fastNLP import Instance
        dataset = DataSet([
            Instance(raw_words="This is the first instance",
                     words=['this', 'is', 'the', 'first', 'instance', '.'],
                     seq_len=6),
            Instance(raw_words="Second instance .",
                     words=['Second', 'instance', '.'],
                     seq_len=3)
        ])

        from fastNLP import DataSet
        dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})

        # without modifying dataset, create a new DataSet with the matching instances removed
        dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
        # remove the matching instances from dataset in place
        dataset.drop(lambda ins: ins['a'] < 0)
        #  delete the third instance
        dataset.delete_instance(2)
        #  delete the field named 'a'
        dataset.delete_field('a')

        #  check whether a field named 'a' exists
        print(dataset.has_field('a'))  # or: 'a' in dataset
        #  rename the field 'c' to 'b'
        dataset.rename_field('c', 'b')
        #  length of the DataSet
        len(dataset)

        from fastNLP import DataSet
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
        dataset = DataSet(data)

        # split each sentence into words; see DataSet.apply() for details
        dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

        # or use DataSet.apply_field()
        dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')

        # besides lambdas, a named function can be defined and passed in
        def get_words(instance):
            sentence = instance['raw_words']
            words = sentence.split()
            return words

        dataset.apply(get_words, new_field_name='words')
Example #12
class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"

    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None


    def __init__(self,data_src="20news",min_count=10,seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self,words):
        self.max_seq_len = max(len(words),self.max_seq_len)

    def seq_regularize(self,words):
        wlen = len(words)
        if wlen<self.max_seq_len:
            return [0]*(self.max_seq_len-wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self,size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size==20:
            train,test = get_all_20news()
        else:
            train,test = get_text_classification_datasets()
        train_input,test_input = tokenize(train.data,test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text":train_input,"class":train_target})
        self.test_set = DataSet({"text":test_input,"class":test_target})
        
        # Building Fastnlp vocabulary...
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x : [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        # Building id representations for train_set and test_set.
        print("Building id-presentation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set,self.test_set,field_name='text',new_field_name='words')
        
        self.train_set.apply_field(lambda x : len(x),field_name='words',new_field_name='seq_len')
        self.test_set.apply_field(lambda x : len(x),field_name='words',new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len,field_name='words')

        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len,self.seq_limit)

        self.train_set.apply_field(self.seq_regularize,field_name='words',new_field_name='words')
        self.test_set.apply_field(self.seq_regularize,field_name='words',new_field_name='words')
        # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input")
        # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')
        
        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x : int(x['class']),new_field_name="target",is_target=True)
        self.test_set.apply(lambda x : int(x['class']),new_field_name="target",is_target=True)
        # self.train_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")
        # self.test_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")

    def fetch_csv(self,path=None):
        print("Not implemented now...")
        pass

    def fetch_data(self,path=None):
        if self.data_src == "20news":
            # Loading 20newsgroups data and tokenize.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size,self.test_size
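
A minimal usage sketch for the TextData class above (hypothetical driver code; it assumes the helpers the class relies on, such as get_text_classification_datasets and tokenize, are importable in the same module):

# hypothetical usage, assuming the module-level helpers used by TextData are in scope
text_data = TextData(data_src="20news", min_count=10, seq_limit=500)
train_size, test_size = text_data.fetch_data()  # loads 20newsgroups, builds vocab and fields
print(text_data.vocab_size, train_size, test_size)
train_set, test_set = text_data.train_set, text_data.test_set
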
Example #13
def main():
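    # Build the joint SIGHAN 2008 (BMES-tagged) dataset: read the train/test files,
    # create unigram/bigram/tag/task vocabularies, index the fields, and dump
    # everything to total_dataset.pkl under --data_path.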
    parser = argparse.ArgumentParser()
    # fmt: off
    parser.add_argument("--data_path", required=True, type=str, help="all of datasets pkl paths")
    # fmt: on

    options, _ = parser.parse_known_args()

    train_set, test_set = DataSet(), DataSet()

    input_dir = os.path.join(options.data_path, "joint-sighan2008/bmes")
    options.output = os.path.join(options.data_path, "total_dataset.pkl")
    print(input_dir, options.output)

    for fn in os.listdir(input_dir):
        if fn not in ["test.txt", "train-all.txt"]:
            continue
        print(fn)
        abs_fn = os.path.join(input_dir, fn)
        ds = read_file(abs_fn)
        if "test.txt" == fn:
            test_set = ds
        else:
            train_set = ds

    print(
        "num samples of total train, test: {}, {}".format(len(train_set), len(test_set))
    )

    uni_vocab = Vocabulary(min_freq=None).from_dataset(
        train_set, test_set, field_name="ori_words"
    )
    # bi_vocab = Vocabulary(min_freq=3, max_size=50000).from_dataset(train_set,test_set, field_name="bi1")
    bi_vocab = Vocabulary(min_freq=3, max_size=None).from_dataset(
        train_set, field_name="bi1", no_create_entry_dataset=[test_set]
    )
    tag_vocab = Vocabulary(min_freq=None, padding="s", unknown=None).from_dataset(
        train_set, field_name="ori_tags"
    )
    task_vocab = Vocabulary(min_freq=None, padding=None, unknown=None).from_dataset(
        train_set, field_name="task"
    )

    def to_index(dataset):
        uni_vocab.index_dataset(dataset, field_name="ori_words", new_field_name="uni")
        tag_vocab.index_dataset(dataset, field_name="ori_tags", new_field_name="tags")
        task_vocab.index_dataset(dataset, field_name="task", new_field_name="task")

        dataset.apply_field(lambda x: x[1:], field_name="bi1", new_field_name="bi2")
        dataset.apply_field(lambda x: x[:-1], field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi2", new_field_name="bi2")

        dataset.set_input("task", "uni", "bi1", "bi2", "seq_len")
        dataset.set_target("tags")
        return dataset

    train_set = to_index(train_set)
    test_set = to_index(test_set)

    output = {}
    output["train_set"] = train_set
    output["test_set"] = test_set
    output["uni_vocab"] = uni_vocab
    output["bi_vocab"] = bi_vocab
    output["tag_vocab"] = tag_vocab
    output["task_vocab"] = task_vocab

    print(tag_vocab.word2idx)
    print(task_vocab.word2idx)

    make_sure_path_exists(os.path.dirname(options.output))

    print("Saving dataset to {}".format(os.path.abspath(options.output)))
    with open(options.output, "wb") as outfile:
        dump(output, outfile)

    print(len(task_vocab), len(tag_vocab), len(uni_vocab), len(bi_vocab))
    dic = {}
    tokens = {}

    def process(words):
        name = words[0][1:-1]
        if name not in dic:
            dic[name] = set()
            tokens[name] = 0
        tokens[name] += len(words[1:])
        dic[name].update(words[1:])

    train_set.apply_field(process, "ori_words", None)
    for name in dic.keys():
        print(name, len(dic[name]), tokens[name])

    with open(os.path.join(os.path.dirname(options.output), "oovdict.pkl"), "wb") as f:
        dump(dic, f)

    def get_max_len(ds):
        global max_len
        max_len = 0

        def find_max_len(words):
            global max_len
            if max_len < len(words):
                max_len = len(words)

        ds.apply_field(find_max_len, "ori_words", None)
        return max_len

    print(
        "train max len: {}, test max len: {}".format(
            get_max_len(train_set), get_max_len(test_set)
        )
    )
Example #14
class TextData():
    data_src = "all_data"
    class_num = 2
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000

    train_set = DataSet()
    val_set = DataSet()
    test_set = DataSet()
    train_size = 0
    val_size = 0
    test_size = 0

    test_projectid = None

    vocab = None
    vocab_size = 0

    def __init__(self, data_src="all_data", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_csv(self,
                  path,
                  text_var="essay",
                  target="is_exciting",
                  subset_num=None,
                  us_rate=None,
                  os_rate=None):
        """ 
        us_rate: under sampling rate
        os_rate: over sampling rate
         """
        print("Loading data from {} ...".format(path))
        df = pd.read_csv(path)
        # text_vars=["title", "short_description", "need_statement", "essay"]
        text_vars = text_var  # only select the essay column
        target_var = "y"
        df[target_var] = 0.0
        df.loc[df[target] == "t", target_var] = 1.0
        df.loc[df[target] != "t", target_var] = 0.0
        train_df = df[df['split'] == 'train']
        val_df = df[df['split'] == 'val']
        test_df = df[df['split'] == 'test']
        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("nums:({},{},{})".format(train_num, val_num, test_num))
        if os_rate is not None:
            print("Over Sample mode")
            ros = RandomOverSampler(random_state=0)
        elif us_rate is not None:
            print("Under Sample mode")
            train_df_t = train_df[train_df[target] == "t"]
            train_df_f = train_df[train_df[target] == "f"]
            t_num = len(train_df_t)
            f_num = len(train_df_f)
            print("Raw train t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_train_t = train_df_t.sample(n=t_num)
            balanced_train_f = train_df_f.sample(n=f_num)
            train_df = pd.concat([balanced_train_t,
                                  balanced_train_f]).sample(frac=1)
            print("Balanced train: t:f = {}:{}".format(len(balanced_train_t),
                                                       len(balanced_train_f)))
            # print("Train 1.0:",len(train_df[train_df[target_var] == 1.0]))

            val_df_t = val_df[val_df[target] == "t"]
            val_df_f = val_df[val_df[target] == "f"]
            t_num = len(val_df_t)
            f_num = len(val_df_f)
            print("Raw val t:f = {}:{}".format(t_num, f_num))
            nf_num = int(t_num / us_rate)
            f_num = min(nf_num, f_num)
            balanced_val_t = val_df_t.sample(n=t_num)
            balanced_val_f = val_df_f.sample(n=f_num)
            val_df = pd.concat([balanced_val_t, balanced_val_f]).sample(frac=1)
            print("Balanced val: t:f = {}:{}".format(len(balanced_val_t),
                                                     len(balanced_val_f)))
        else:
            print("No sample mode")
        if subset_num is not None and subset_num > 0:
            print("Get sub set of size {}.".format(subset_num))
            train_df = train_df.sample(n=subset_num)
            val_df = val_df.sample(n=subset_num)

        train_num = len(train_df)
        val_num = len(val_df)
        test_num = len(test_df)
        print("subset nums:({},{},{})".format(train_num, val_num, test_num))

        train_target = train_df[target_var].values
        count = 0
        print(count)
        val_target = val_df[target_var].values
        test_target = test_df[target_var].values

        print("tokenize train set")
        train_input = tokenize(train_df[text_vars].values)
        print("tokenize val set")
        val_input = tokenize(val_df[text_vars].values)
        print("tokenize test set")
        test_input = tokenize(test_df[text_vars].values)

        assert (self.class_num == 2)
        self.test_projectid = test_df['projectid']
        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        if os_rate is not None:
            print("Over Sampling...")
            train_input, train_target = ros.fit_sample(
                np.array(train_input)[:, np.newaxis],
                np.array(train_target)[:, np.newaxis])
            train_input = train_input.squeeze().tolist()
            train_target = train_target.tolist()
            val_input, val_target = ros.fit_sample(
                np.array(val_input)[:, np.newaxis],
                np.array(val_target)[:, np.newaxis])
            val_input = val_input.squeeze().tolist()
            val_target = val_target.tolist()
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.val_set = DataSet({"text": val_input, "class": val_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary...
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(
            lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        # Building id representations for train_set, val_set and test_set.
        print("Building id-presentation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set,
                                 self.val_set,
                                 self.test_set,
                                 field_name='text',
                                 new_field_name='words')

        self.train_set.apply_field(lambda x: len(x),
                                   field_name='words',
                                   new_field_name='seq_len')
        self.val_set.apply_field(lambda x: len(x),
                                 field_name='words',
                                 new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x),
                                  field_name='words',
                                  new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')

        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)

        self.train_set.apply_field(self.seq_regularize,
                                   field_name='words',
                                   new_field_name='words')
        self.val_set.apply_field(self.seq_regularize,
                                 field_name='words',
                                 new_field_name='words')
        self.test_set.apply_field(self.seq_regularize,
                                  field_name='words',
                                  new_field_name='words')
        # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input")
        # self.val_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')
        # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']),
                             new_field_name="target",
                             is_target=True)
        self.val_set.apply(lambda x: int(x['class']),
                           new_field_name="target",
                           is_target=True)
        self.test_set.apply(lambda x: int(x['class']),
                            new_field_name="target",
                            is_target=True)
        # self.train_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")
        # self.test_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")

    def fetch_data(self,
                   path,
                   text_var="essay",
                   target_var="is_exciting",
                   subset_num=None,
                   us_rate=None,
                   os_rate=None):
        if self.data_src == "all_data":
            # Loading 20newsgroups data and tokenize.
            self.fetch_csv(path, text_var, target_var, subset_num, us_rate,
                           os_rate)
        else:
            print("No legal data src type:{} ...".format(self.data_src))
            assert (0 == 1)

        self.train_size = self.train_set.get_length()
        self.val_size = self.val_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.val_size, self.test_size
Example #15
def preprocessing(data_train, data_test):
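    # Clean and tokenize both splits, truncate each text to text_len tokens, build a
    # vocabulary from the training data, index both sets, then split off a dev set
    # and mark the input/target fields.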
    data_train_dict = {'raw_text': data_train.data,
                        'label': data_train.target}
    data_test_dict = {'raw_text': data_test.data,
                        'label': data_test.target}
    dataset = DataSet(data_train_dict)
    test_set = DataSet(data_test_dict)
    dataset.apply_field(lambda piece: re.sub('[' + string.whitespace + '\u200b]+', ' ', 
                        re.sub('[' + string.punctuation +']', '', piece)).strip().lower(), 
                        field_name='raw_text', new_field_name='raw_text')
    test_set.apply_field(lambda piece: re.sub('[' + string.whitespace + '\u200b]+', ' ', 
                        re.sub('[' + string.punctuation + ']', '', piece)).strip().lower(), 
                        field_name='raw_text', new_field_name='raw_text')
    dataset.apply_field(lambda piece: piece.split(' '), 
                        field_name='raw_text', new_field_name='text')
    test_set.apply_field(lambda piece: piece.split(' '), 
                        field_name='raw_text', new_field_name='text')

    # inspect the distribution of text lengths to choose a suitable text_len
    # data_lens = []
    # for instance in dataset:
    #     data_lens.append(len(instance['text']))
    # for instance in test_set:
    #     data_lens.append(len(instance['text']))
    # print("max text_len %d, min text_len %d" % (max(data_lens), min(data_lens)))
    # print(len([i for i in data_lens if i < 400]))
    # plt.hist(data_lens, bins=200, facecolor="blue", edgecolor="black", alpha=0.7)
    # plt.xlabel("text_length")
    # plt.ylabel("number of texts")
    # plt.title("Distribution of text_length")
    # plt.show()

    dataset.apply_field(lambda piece: piece[:text_len], 
                        field_name='text', new_field_name='text')
    test_set.apply_field(lambda piece: piece[:text_len], 
                        field_name='text', new_field_name='text')
    
    dataset.delete_field('raw_text')
    test_set.delete_field('raw_text')

    # normalize all digits to the same form
    # for instance in dataset:
    #     for i, word in enumerate(instance['text']):
    #         if word.isdigit():
    #             instance['text'][i] = '1'
    # for instance in test_set:
    #     for i, word in enumerate(instance['text']):
    #         if word.isdigit():
    #             instance['text'][i] = '1'

    vocab = Vocabulary(min_freq=min_freqency, unknown='<unk>', padding='<pad>').from_dataset(dataset, field_name='text')
    print("vocabulary_length:", len(vocab))
    vocab.index_dataset(dataset, field_name='text', new_field_name='text')
    vocab.index_dataset(test_set, field_name='text', new_field_name='text')

    # optionally pad every text to the same length

    train_set, dev_set = dataset.split(0.2)

    train_set.rename_field('text', Const.INPUT)
    train_set.rename_field('label', Const.TARGET)
    train_set.set_input(Const.INPUT)
    train_set.set_target(Const.TARGET)
    dev_set.rename_field('text', Const.INPUT)
    dev_set.rename_field('label', Const.TARGET)
    dev_set.set_input(Const.INPUT)
    dev_set.set_target(Const.TARGET)
    test_set.rename_field('text', Const.INPUT)
    test_set.rename_field('label', Const.TARGET)
    test_set.set_input(Const.INPUT)
    test_set.set_target(Const.TARGET)

    print("train_set length:", len(train_set))
    print("dev_set length:", len(dev_set))
    print("test_set length:", len(test_set))

    return train_set, dev_set, test_set, vocab
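
A minimal usage sketch for preprocessing() above (hypothetical; it assumes the function lives in a module where text_len and min_freqency are defined, and that the input splits are scikit-learn-style objects with .data and .target attributes, e.g. 20 Newsgroups):

# hypothetical driver code in the same module as preprocessing()
from sklearn.datasets import fetch_20newsgroups

text_len = 400       # truncation length (illustrative choice)
min_freqency = 10    # vocabulary frequency cut-off (illustrative choice)

data_train = fetch_20newsgroups(subset='train')
data_test = fetch_20newsgroups(subset='test')
train_set, dev_set, test_set, vocab = preprocessing(data_train, data_test)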