"""Reads a tab separated value file.""" with io.open(input_file, "r", encoding="UTF-8") as file: examples = [] for line in file: data = line.strip().split("_!_") example = InputExample(guid=data[0], label=data[1], text_a=data[3]) examples.append(example) return examples if __name__ == "__main__": from paddlehub.tokenizer.bert_tokenizer import BertTokenizer tokenizer = BertTokenizer(vocab_file='vocab.txt') ds = TNews(tokenizer=tokenizer, max_seq_len=10) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("first 10 train") for e in ds.get_train_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("first 10 test") for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) print("first 10 dev records") for e in ds.get_dev_records()[:10]: print(e)
dev_file="dev.txt", test_file=None, predict_file="test.txt", train_file_with_header=True, dev_file_with_header=True, predict_file_with_header=True, label_list=["0", "1"], tokenizer=tokenizer, max_seq_len=max_seq_len) if __name__ == "__main__": from paddlehub.tokenizer.tokenizer import CustomTokenizer from paddlehub.tokenizer.bert_tokenizer import BertTokenizer tokenizer = BertTokenizer( vocab_file='/mnt/zhangxuefei/.paddlehub/modules/ernie/assets/vocab.txt', tokenize_chinese_chars=False) ds = DuEL(tokenizer=tokenizer, max_seq_len=60) print("first 10 train examples") for e in ds.get_train_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.text_c, e.label)) print("first 10 train records") for e in ds.get_train_records()[:10]: print(e) print("first 10 test records") for e in ds.get_test_records()[:10]: print(e) print("first 10 predict records") for e in ds.get_predict_records()[:10]: print(e)