def get_tokenizer(FLAGS, vocab_path, **kargs):
    """Build and return a tokenizer selected by ``FLAGS.tokenizer``.

    Args:
        FLAGS: flags object; reads ``FLAGS.tokenizer`` and, for the
            "bert" case, ``FLAGS.do_lower_case``.
        vocab_path: path to the vocabulary file (one token per line).
        **kargs: optional ``config`` dict forwarded to ``Jieba_CHAR``.

    Returns:
        A ``tokenization.FullTokenizer`` when ``FLAGS.tokenizer == "bert"``,
        or a vocab-loaded ``tokenization.Jieba_CHAR`` when it is
        "jieba_char".

    Raises:
        ValueError: if ``FLAGS.tokenizer`` is not a recognized type.
    """
    if FLAGS.tokenizer == "bert":
        tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=FLAGS.do_lower_case)
    elif FLAGS.tokenizer == "jieba_char":
        tokenizer = tokenization.Jieba_CHAR(config=kargs.get("config", {}))
        # splitlines() already yields one token per element; no need to
        # copy the list element-by-element as the original did.
        with tf.gfile.Open(vocab_path, "r") as f:
            vocab_lst = f.read().splitlines()
        print(len(vocab_lst))
        tokenizer.load_vocab(vocab_lst)
    else:
        # Original code fell through here and raised NameError on the
        # return; fail fast with a clear message instead.
        raise ValueError("unknown tokenizer type: %s" % FLAGS.tokenizer)
    return tokenizer
def main(_):
    """Convert LCQMC train/test/dev data into distillation TFRecord files.

    Reads all inputs from ``FLAGS.buckets``-relative paths, filters the
    vocabulary down to tokens seen in the training examples, then writes
    train/test/dev feature files with the filtered tokenizer.
    """

    def _read_vocab(path):
        # Read a vocab file (one token per line) and report its size.
        with tf.gfile.Open(path, "r") as f:
            lines = f.read().splitlines()
        print(len(lines))
        return lines

    tokenizer = tokenization.Jieba_CHAR(config=FLAGS.config)

    # Resolve every input/output path under the bucket root.
    vocab_path = os.path.join(FLAGS.buckets, FLAGS.vocab_file)
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    test_file = os.path.join(FLAGS.buckets, FLAGS.test_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)
    train_result_file = os.path.join(FLAGS.buckets, FLAGS.train_result_file)
    test_result_file = os.path.join(FLAGS.buckets, FLAGS.test_result_file)
    dev_result_file = os.path.join(FLAGS.buckets, FLAGS.dev_result_file)
    corpus_vocab_path = os.path.join(FLAGS.buckets, FLAGS.corpus_vocab_path)

    print(FLAGS.with_char)
    vocab_lst = _read_vocab(vocab_path)
    tokenizer.load_vocab(vocab_lst)

    print("==not apply rule==")
    if FLAGS.data_type == "lcqmc":
        classifier_data_api = classifier_processor.LCQMCProcessor()
        classifier_data_api.get_labels(FLAGS.label_id)

        train_examples = classifier_data_api.get_train_examples(
            train_file, is_shuffle=True)

        # Restrict the corpus vocabulary to tokens that actually occur in
        # the training data, capped at FLAGS.predefined_vocab_size.
        vocab_filter.vocab_filter(train_examples, vocab_lst, tokenizer,
                                  FLAGS.predefined_vocab_size,
                                  corpus_vocab_path)

        tokenizer_corpus = tokenization.Jieba_CHAR(config=FLAGS.config)
        tokenizer_corpus.load_vocab(_read_vocab(corpus_vocab_path))

        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            train_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, train_result_file, FLAGS.with_char,
            FLAGS.char_len)

        test_examples = classifier_data_api.get_train_examples(
            test_file, is_shuffle=False)
        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            test_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, test_result_file, FLAGS.with_char,
            FLAGS.char_len)

        dev_examples = classifier_data_api.get_train_examples(
            dev_file, is_shuffle=False)
        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            dev_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, dev_result_file, FLAGS.with_char,
            FLAGS.char_len)
def main(_):
    """Build supervised + unsupervised distillation TFRecord datasets.

    Combines supervised distillation examples (train file) with optional
    unsupervised ones (dev file), shuffles them, and writes train/dev/test
    feature files using either a jieba-char or a BERT full tokenizer.

    Raises:
        ValueError: on an unrecognized ``FLAGS.tokenizer_type`` or
            ``FLAGS.distillation_type`` (the original code crashed later
            with NameError in these cases).
    """
    import random

    def _read_vocab(path):
        # Read a vocab file (one token per line) and report its size.
        with tf.gfile.Open(path, "r") as f:
            lines = f.read().splitlines()
        print(len(lines))
        return lines

    # Resolve every input/output path under the bucket root.
    vocab_path = os.path.join(FLAGS.buckets, FLAGS.vocab_file)
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    test_file = os.path.join(FLAGS.buckets, FLAGS.test_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)
    train_result_file = os.path.join(FLAGS.buckets, FLAGS.train_result_file)
    test_result_file = os.path.join(FLAGS.buckets, FLAGS.test_result_file)
    dev_result_file = os.path.join(FLAGS.buckets, FLAGS.dev_result_file)
    corpus_vocab_path = os.path.join(FLAGS.buckets, FLAGS.corpus_vocab_path)
    unsupervised_distillation_file = os.path.join(
        FLAGS.buckets, FLAGS.unsupervised_distillation_file)
    supervised_distillation_file = os.path.join(
        FLAGS.buckets, FLAGS.supervised_distillation_file)

    if FLAGS.tokenizer_type == "jieba":
        tokenizer = tokenization.Jieba_CHAR(config=FLAGS.config)
        print(FLAGS.with_char)
        vocab_lst = _read_vocab(vocab_path)
        tokenizer.load_vocab(vocab_lst)
    elif FLAGS.tokenizer_type == "full_bpe":
        # NOTE: FLAGS.lower_case is compared as a string here, matching the
        # original ("true"/"false" string flag).
        tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path,
            do_lower_case=FLAGS.lower_case == "true")
    else:
        raise ValueError(
            "unknown tokenizer type: %s" % FLAGS.tokenizer_type)

    print("==not apply rule==")
    if FLAGS.distillation_type == "prob":
        classifier_data_api = \
            classifier_processor.FasttextDistillationProcessor()
    elif FLAGS.distillation_type == "structure":
        classifier_data_api = \
            classifier_processor.FasttextStructureDistillationProcessor()
    else:
        raise ValueError(
            "unknown distillation type: %s" % FLAGS.distillation_type)
    classifier_data_api.get_labels(FLAGS.label_id)

    train_examples = classifier_data_api.get_supervised_distillation_examples(
        train_file, supervised_distillation_file, is_shuffle=True)

    if FLAGS.tokenizer_type == "jieba":
        # Restrict the corpus vocabulary to tokens seen in training data,
        # then load a fresh tokenizer from the filtered vocab.
        vocab_filter.vocab_filter(train_examples, vocab_lst, tokenizer,
                                  FLAGS.predefined_vocab_size,
                                  corpus_vocab_path)
        tokenizer_corpus = tokenization.Jieba_CHAR(config=FLAGS.config)
        tokenizer_corpus.load_vocab(_read_vocab(corpus_vocab_path))
    else:  # full_bpe — reuse the BERT tokenizer as-is.
        tokenizer_corpus = tokenizer

    dev_examples = classifier_data_api.get_unsupervised_distillation_examples(
        dev_file, unsupervised_distillation_file, is_shuffle=False)

    # Optionally fold the unlabeled (dev) examples into the training set.
    if FLAGS.if_add_unlabeled_distillation == "yes":
        total_train_examples = train_examples + dev_examples
    else:
        total_train_examples = train_examples
    random.shuffle(total_train_examples)

    if FLAGS.tokenizer_type == "jieba":
        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            total_train_examples, classifier_data_api.label2id,
            FLAGS.max_length, tokenizer_corpus, train_result_file,
            FLAGS.with_char, FLAGS.char_len)
        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            dev_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, dev_result_file, FLAGS.with_char,
            FLAGS.char_len)
        test_examples = classifier_data_api.get_train_examples(
            test_file, is_shuffle=False)
        write_to_tfrecords.convert_distillation_classifier_examples_to_features(
            test_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, test_result_file, FLAGS.with_char,
            FLAGS.char_len)
    else:  # full_bpe — use the BERT-specific converter.
        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            total_train_examples, classifier_data_api.label2id,
            FLAGS.max_length, tokenizer_corpus, train_result_file,
            FLAGS.with_char, FLAGS.char_len)
        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            dev_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, dev_result_file, FLAGS.with_char,
            FLAGS.char_len)
        test_examples = classifier_data_api.get_train_examples(
            test_file, is_shuffle=False)
        write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
            test_examples, classifier_data_api.label2id, FLAGS.max_length,
            tokenizer_corpus, test_result_file, FLAGS.with_char,
            FLAGS.char_len)
def main(_):
    """Convert fasttext-format classification data to TFRecord files.

    Loads train/test/dev examples, optionally filters the vocabulary to
    tokens occurring in those examples (jieba path), and writes feature
    files with either the jieba-char or BERT converter.

    Raises:
        ValueError: on an unrecognized ``FLAGS.tokenizer_type`` (the
            original code crashed later with NameError in this case).
    """

    def _read_vocab(path):
        # Read a vocab file (one token per line) and report its size.
        with tf.gfile.Open(path, "r") as f:
            lines = f.read().splitlines()
        print(len(lines))
        return lines

    # NOTE(review): unlike the other scripts, vocab_path is deliberately
    # NOT joined onto FLAGS.buckets here (see the commented-out original).
    vocab_path = FLAGS.vocab_file
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    test_file = os.path.join(FLAGS.buckets, FLAGS.test_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)
    train_result_file = os.path.join(FLAGS.buckets, FLAGS.train_result_file)
    test_result_file = os.path.join(FLAGS.buckets, FLAGS.test_result_file)
    dev_result_file = os.path.join(FLAGS.buckets, FLAGS.dev_result_file)
    corpus_vocab_path = os.path.join(FLAGS.buckets, FLAGS.corpus_vocab_path)

    if FLAGS.tokenizer_type == "jieba":
        tokenizer = tokenization.Jieba_CHAR(config=FLAGS.config)
        print(FLAGS.with_char)
        vocab_lst = _read_vocab(vocab_path)
        tokenizer.load_vocab(vocab_lst)
    elif FLAGS.tokenizer_type == "full_bpe":
        # NOTE(review): here FLAGS.lower_case is passed through directly
        # (boolean flag?), while the sibling script compares it to "true" —
        # confirm the flag's declared type.
        tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=FLAGS.lower_case)
    else:
        raise ValueError(
            "unknown tokenizer type: %s" % FLAGS.tokenizer_type)

    print("==not apply rule==")
    if FLAGS.data_type == "fasttext":
        classifier_data_api = \
            classifier_processor.FasttextClassifierProcessor()
        classifier_data_api.get_labels(FLAGS.label_id)

        train_examples = classifier_data_api.get_train_examples(
            train_file, is_shuffle=True)
        print("==total train examples==", len(train_examples))
        test_examples = classifier_data_api.get_train_examples(
            test_file, is_shuffle=False)
        print("==total test examples==", len(test_examples))
        dev_examples = classifier_data_api.get_train_examples(
            dev_file, is_shuffle=False)
        print("==total dev examples==", len(dev_examples))

        if FLAGS.tokenizer_type == "jieba":
            # Filter the vocabulary down to tokens seen in any split, then
            # load a fresh tokenizer from the filtered vocab file.
            vocab_filter.vocab_filter(
                train_examples + test_examples + dev_examples, vocab_lst,
                tokenizer, FLAGS.predefined_vocab_size, corpus_vocab_path)
            tokenizer_corpus = tokenization.Jieba_CHAR(config=FLAGS.config)
            tokenizer_corpus.load_vocab(_read_vocab(corpus_vocab_path))
        else:  # full_bpe — reuse the BERT tokenizer as-is.
            tokenizer_corpus = tokenizer

        if FLAGS.tokenizer_type == "jieba":
            write_to_tfrecords.convert_distillation_classifier_examples_to_features(
                train_examples, classifier_data_api.label2id,
                FLAGS.max_length, tokenizer_corpus, train_result_file,
                FLAGS.with_char, FLAGS.char_len)
            write_to_tfrecords.convert_distillation_classifier_examples_to_features(
                test_examples, classifier_data_api.label2id,
                FLAGS.max_length, tokenizer_corpus, test_result_file,
                FLAGS.with_char, FLAGS.char_len)
            write_to_tfrecords.convert_distillation_classifier_examples_to_features(
                dev_examples, classifier_data_api.label2id,
                FLAGS.max_length, tokenizer_corpus, dev_result_file,
                FLAGS.with_char, FLAGS.char_len)
        else:  # full_bpe — use the BERT-specific converter.
            write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
                train_examples, classifier_data_api.label2id,
                FLAGS.max_length, tokenizer_corpus, train_result_file,
                FLAGS.with_char, FLAGS.char_len,
                label_type=FLAGS.label_type)
            write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
                dev_examples, classifier_data_api.label2id,
                FLAGS.max_length, tokenizer_corpus, dev_result_file,
                FLAGS.with_char, FLAGS.char_len,
                label_type=FLAGS.label_type)
            # The original re-read test_file here although test_examples
            # was already loaded above; the duplicate read was dropped.
            write_to_tfrecords.convert_bert_distillation_classifier_examples_to_features(
                test_examples, classifier_data_api.label2id,
                FLAGS.max_length, tokenizer_corpus, test_result_file,
                FLAGS.with_char, FLAGS.char_len,
                label_type=FLAGS.label_type)