def construct_test(root, path, tokenizer_path, MAX_SEQUENCE_LENGTH, test=False, batch_size=1):
    tokenizer = tokenization.FullTokenizer(tokenizer_path, True)
    data = pd.read_csv(root + path)
    input_categories = list(data.columns[[1, 2, 5]])
    inputs = compute_input_arays(data, input_categories, tokenizer,
                                 MAX_SEQUENCE_LENGTH)

    # Wrap the precomputed feature arrays in a generator so tf.data can
    # stream them without materialising the whole set again.
    def generator():
        for in1, in2, in3 in zip(inputs[0], inputs[1], inputs[2]):
            yield {'input_word_ids': in1,
                   'input_masks': in2,
                   'input_segments': in3}

    dataset = tf.data.Dataset.from_generator(
        generator,
        {'input_word_ids': tf.int32,
         'input_masks': tf.int32,
         'input_segments': tf.int32})
    dataset = dataset.batch(batch_size)
    print('Test dataset constructed successfully with shape =', data.shape)
    return dataset

def load_model(self, gpu_id, vocab_file, gpu_memory_fraction, model_path, max_seq_length):
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = gpu_id
    self.tokenizer = bert_tokenization.FullTokenizer(vocab_file=vocab_file,
                                                     do_lower_case=True)
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=gpu_memory_fraction)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)
    self.sess = tf.Session(config=sess_config)
    # Load the frozen graph and import it into the session's graph.
    with gfile.FastGFile(model_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        self.sess.graph.as_default()
        tf.import_graph_def(graph_def, name="")
    self.sess.run(tf.global_variables_initializer())
    # Recover the input and output tensors by name for later inference.
    self.is_train = self.sess.graph.get_tensor_by_name("input/is_train:0")
    self.input_ids = self.sess.graph.get_tensor_by_name("input/input_ids:0")
    self.input_mask = self.sess.graph.get_tensor_by_name("input/input_mask:0")
    self.segment_ids = self.sess.graph.get_tensor_by_name("input/segment_ids:0")
    self.predictions = self.sess.graph.get_tensor_by_name("output_layer/predictions:0")
    self.max_seq_length = max_seq_length

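# Hedged usage sketch: once load_model has run, inference feeds the recovered
# placeholders. `ids`, `mask`, and `segs` stand for int32 arrays of shape
# [batch, max_seq_length]; the names are illustrative, not from the source:
#
#   preds = self.sess.run(self.predictions, feed_dict={
#       self.input_ids: ids,
#       self.input_mask: mask,
#       self.segment_ids: segs,
#       self.is_train: False,
#   })
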
def __init__(self, is_training):
    self.is_training = is_training
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    print(FLAGS.include_unknowns)
    print(FLAGS.max_seq_length)

def prepro(config):
    tokenizer = bert_tokenization.FullTokenizer(vocab_file=config.vocab_file)
    trainBunch = build_features(config, config.train_file, "train", tokenizer)
    print("save train bunch")
    with open(config.train_eval, "wb") as f:
        pickle.dump(trainBunch, f)
    devBunch = build_features(config, config.dev_file, "dev", tokenizer)
    print("save dev bunch")
    with open(config.dev_eval, "wb") as f:
        pickle.dump(devBunch, f)

def load_model_etc(model_dir):
    with open(_config_path(model_dir)) as f:
        config = json.load(f)
    model = load_model(_model_path(model_dir))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=_vocab_path(model_dir),
        do_lower_case=config['do_lower_case'])
    labels = load_labels(_labels_path(model_dir))
    return model, tokenizer, labels, config

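# The _*_path helpers are defined elsewhere; a minimal sketch of what they
# plausibly look like (the file names below are assumptions, not confirmed
# by this code):
import os

def _config_path(model_dir):
    return os.path.join(model_dir, 'config.json')

def _model_path(model_dir):
    return os.path.join(model_dir, 'model.h5')

def _vocab_path(model_dir):
    return os.path.join(model_dir, 'vocab.txt')

def _labels_path(model_dir):
    return os.path.join(model_dir, 'labels.txt')
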
def load_bert_vocab(self):
    with tf.Graph().as_default():
        bert_model = hub.Module(self.bert_url)
        vocab_info = bert_model(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [vocab_info["vocab_file"], vocab_info["do_lower_case"]])
    return bert_tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

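# Usage sketch: resolving the vocab through the module's "tokenization_info"
# signature keeps the tokenizer consistent with the checkpoint's vocabulary
# and casing. The hub URL below is illustrative:
#
#   self.bert_url = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
#   tokenizer = self.load_bert_vocab()
#   print(tokenizer.tokenize("vocab resolved from the module itself"))
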
def load_ner_model(ner_model_dir):
    with open(_ner_config_path(ner_model_dir)) as f:
        config = json.load(f)
    model = keras.models.load_model(
        _ner_model_path(ner_model_dir),
        custom_objects=get_custom_objects()
    )
    tokenizer = tokenization.FullTokenizer(
        vocab_file=_ner_vocab_path(ner_model_dir),
        do_lower_case=config['do_lower_case']
    )
    labels = read_labels(_ner_labels_path(ner_model_dir))
    return model, tokenizer, labels, config

def load_pretrained(options):
    model = load_trained_model_from_checkpoint(
        options.bert_config_file,
        options.init_checkpoint,
        training=False,
        trainable=True,
        seq_len=options.max_seq_length,
    )
    tokenizer = tokenization.FullTokenizer(
        vocab_file=options.vocab_file,
        do_lower_case=options.do_lower_case
    )
    return model, tokenizer

def get_tokenizer(self):
    bert_layer = self.model.get_layer("bert")
    try:
        vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()  # noqa
        do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
        tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)
    except AttributeError:
        # ALBERT uses a SentencePiece tokenizer, so its hub module exposes
        # sp_model_file instead of vocab_file:
        # AttributeError: '_UserObject' object has no attribute 'vocab_file'
        sp_model_file = bert_layer.resolved_object.sp_model_file.asset_path.numpy()  # noqa
        tokenizer = bert_tokenization.FullSentencePieceTokenizer(sp_model_file)
    return tokenizer

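# Hedged sketch of the two layer flavors this fallback handles: a TF2 hub
# KerasLayer named "bert" exposes either a WordPiece vocab (BERT) or a
# SentencePiece model (ALBERT). The hub handle is illustrative and fetching
# it requires network access.
import tensorflow_hub as hub

bert = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    name="bert")
# BERT modules resolve vocab_file; ALBERT ones resolve sp_model_file instead.
print(bert.resolved_object.vocab_file.asset_path.numpy())
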
def construct(root, path, tokenizer_path, MAX_SEQUENCE_LENGTH, test=False, batch_size=1):
    tokenizer = tokenization.FullTokenizer(tokenizer_path, True)
    data = pd.read_csv(root + path)
    output_categories = list(data.columns[11:])
    input_categories = list(data.columns[[1, 2, 5]])
    outputs = compute_output_arrays(data, output_categories)
    inputs = compute_input_arays(data, input_categories, tokenizer,
                                 MAX_SEQUENCE_LENGTH)

    if not test:
        # Stream (features, label) pairs into a batched tf.data pipeline.
        def generator():
            for in1, in2, in3, out in zip(inputs[0], inputs[1], inputs[2],
                                          outputs):
                yield {'input_word_ids': in1,
                       'input_masks': in2,
                       'input_segments': in3}, out

        dataset = tf.data.Dataset.from_generator(
            generator,
            ({'input_word_ids': tf.int32,
              'input_masks': tf.int32,
              'input_segments': tf.int32},
             tf.float32))
        dataset = dataset.batch(batch_size)
        print('Train dataset constructed successfully with shape =', data.shape)
    else:  # evaluation: hand back the raw arrays instead of a pipeline
        return (inputs, outputs)
    return dataset

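# Usage sketch (the paths are illustrative): test=False builds a batched
# tf.data pipeline ready for model.fit(); test=True short-circuits and
# returns the raw (inputs, outputs) arrays for offline evaluation.
#
#   train_ds = construct(PATH, 'train.csv', BERT_PATH + '/assets/vocab.txt',
#                        MAX_SEQUENCE_LENGTH=512, batch_size=8)
#   eval_inputs, eval_outputs = construct(PATH, 'train.csv',
#                                         BERT_PATH + '/assets/vocab.txt',
#                                         MAX_SEQUENCE_LENGTH=512, test=True)
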
def main(argv):
    args = argparser().parse_args(argv[1:])
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file,
        do_lower_case=args.do_lower_case
    )
    label_list = load_labels(args.labels)
    label_map = {l: i for i, l in enumerate(label_list)}
    examples = []
    for x, y in tsv_generator(args.input_file, tokenizer, label_map, args):
        examples.append(Example(x, y))
        if args.max_examples and len(examples) >= args.max_examples:
            break
    write_examples(examples, args.output_file)
    return 0

def preprocess_():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    tf.reset_default_graph()
    if BINARY:
        preprocess.ITALIAN = False
        x_text, y = preprocess.load_data_and_bin_labels(
            "./CrisisLexT26_preprocessed/")
        preprocess.ITALIAN = True
        x_text_italian, y_italian = preprocess.load_data_and_bin_labels(
            "./italian_preprocessed/")
    else:
        preprocess.ITALIAN = False
        x_text, y = preprocess.load_data_and_labels(
            "./CrisisLexT26_preprocessed/")
        preprocess.ITALIAN = True
        x_text_italian, y_italian = preprocess.load_data_and_labels(
            "./italian_preprocessed/")

    x_english_to_italian = preprocess.load_from_file(
        "../Data/EnglishData/CrisisLexT26_english_to_italian_output_related.txt")
    x_italian_to_english = preprocess.load_from_file(
        "../Data/ItalianData/italian_to_english_output_related.txt")

    max_document_length = max([len(x.split(" ")) for x in x_text])
    max_document_length_italian = max(
        [len(x.split(" ")) for x in x_text_italian])
    print("Max Document length:", max_document_length)
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    vocab_processor_italian = learn.preprocessing.VocabularyProcessor(
        max_document_length_italian)

    if PRETRAINEDEMBEDDING and (main_pre_trained_embeddings.Embedding == "ELMo"
                                or main_pre_trained_embeddings.Embedding == "Bert"):
        x = x_text
        x_italian = x_text_italian
    else:
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        x_italian = np.array(
            list(vocab_processor_italian.fit_transform(x_text_italian)))

    shuffle_indices = np.random.permutation(np.arange(len(y)))
    shuffle_indices_italian = np.random.permutation(np.arange(len(y_italian)))

    if CNN:
        if PRETRAINEDEMBEDDING and (
                main_pre_trained_embeddings.Embedding == "ELMo"
                or main_pre_trained_embeddings.Embedding == "Bert"
                or main_pre_trained_embeddings.Embedding == 'GloVe'
                or main_pre_trained_embeddings.Embedding == "fastText"):
            x_shuffled = x
            y_shuffled = y
            x_shuffled_italian = x_italian
            y_shuffled_italian = y_italian
        else:
            x_shuffled = x[shuffle_indices]
            y_shuffled = y[shuffle_indices]
            x_shuffled_italian = x_italian[shuffle_indices_italian]
            y_shuffled_italian = y_italian[shuffle_indices_italian]
    if SVM or NB:
        x_shuffled = x_text
        y_shuffled = y
        x_shuffled_italian = x_text_italian
        y_shuffled_italian = y_italian

    # Split train/test set
    if PRETRAINEDEMBEDDING and main_pre_trained_embeddings.Embedding == "Bert":
        import bert_tokenization
        if ITALIAN:
            tokenizer = bert_tokenization.FullTokenizer(
                vocab_file="../Data/bert_checkpoint_multilingual/vocab.txt",
                do_lower_case=True)
        else:
            tokenizer = bert_tokenization.FullTokenizer(
                vocab_file="../Data/bert_checkpoint/vocab.txt",
                do_lower_case=True)
        tokenized = [tokenizer.tokenize(j) for i, j in enumerate(x_shuffled)]
        # Replace each sentence with the running global indices of its tokens.
        x_t = []
        index = 0
        for sent_tokens in tokenized:
            x_t.append([])
            for _ in sent_tokens:
                x_t[-1].append(index)
                index += 1
        x_shuffled = x_t

    # SPLIT THE DATASET in 1) labeled training 2) unlabeled 3) validation 4) test
    percentage = 0.50
    dev_sample_index = -750
    dev_labeled_index = 3000
    dev_unlabeled_index = 7500
    x_train, x_unlabeled, x_dev = (
        x_shuffled[:int(dev_labeled_index * percentage)],
        x_shuffled[dev_labeled_index:dev_unlabeled_index + dev_labeled_index],
        x_shuffled[dev_sample_index:])
    y_train, y_unlabeled, y_dev = (
        y_shuffled[:int(dev_labeled_index * percentage)],
        y_shuffled[dev_labeled_index:dev_unlabeled_index + dev_labeled_index],
        y_shuffled[dev_sample_index:])

    italian_dev_sample_index = -250
    italian_dev_labeled_index = 1000
    italian_dev_unlabeled_index = 3000
    x_train_italian, x_unlabeled_italian, x_dev_italian = (
        x_shuffled_italian[:int(italian_dev_labeled_index * percentage)],
        x_shuffled_italian[italian_dev_labeled_index:
                           italian_dev_unlabeled_index + italian_dev_labeled_index],
        x_shuffled_italian[italian_dev_sample_index:])
    y_train_italian, y_unlabeled_italian, y_dev_italian = (
        y_shuffled_italian[:int(italian_dev_labeled_index * percentage)],
        y_shuffled_italian[italian_dev_labeled_index:
                           italian_dev_unlabeled_index + italian_dev_labeled_index],
        y_shuffled_italian[italian_dev_sample_index:])

    x_english_to_italian = x_english_to_italian[
        dev_labeled_index:dev_unlabeled_index + dev_labeled_index]
    x_italian_to_english = x_italian_to_english[
        italian_dev_labeled_index:italian_dev_unlabeled_index +
        italian_dev_labeled_index]

    del x, y, x_shuffled, y_shuffled, x_shuffled_italian, y_shuffled_italian
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Unlabeled/Dev split: {:d}/{:d}/{:d}".format(
        len(y_train), len(y_unlabeled), len(y_dev)))
    return (x_text, x_train, x_unlabeled, x_dev, vocab_processor,
            y_train, y_unlabeled, y_dev,
            x_text_italian, x_train_italian, x_unlabeled_italian,
            x_dev_italian, vocab_processor_italian, y_train_italian,
            y_unlabeled_italian, y_dev_italian,
            x_english_to_italian, x_italian_to_english)

    # ... continuation of the loop that reads one embedding vector per line.
    i += 1
    word_weights = np.asarray(values[-d:], dtype=np.float32)
    word2idx[word] = index + 1
    weights.append(word_weights)
    if index + 1 == 400:
        break

# Reserve index 0 for padding and the last index for unknown tokens.
embedding_size = len(weights[0])
weights.insert(0, np.random.randn(embedding_size))
UNKNOWN_TOKEN = len(weights)
word2idx['UNK'] = UNKNOWN_TOKEN
weights.append(np.random.randn(embedding_size))

if Embedding == "Bert":
    weights = []
    tokenizer = bert_tokenization.FullTokenizer(
        vocab_file="./bert_checkpoint/vocab.txt", do_lower_case=True)
    tokenized = [tokenizer.tokenize(j) for i, j in enumerate(x_text)]
    # Truncate each sentence, leaving room for [CLS] and [SEP].
    for i, j in enumerate(tokenized):
        if len(j) > max_seq_length - 2:
            tokenized[i] = tokenized[i][0:(max_seq_length - 2)]
    bert_config = []
    i = -1
    word2idx = {('PAD', 0): -1}
    j = 0
    # Each line of the precomputed file holds a token and its vector;
    # "[CLS]" marks the start of a new sentence.
    for line in open('CrisisLexT26_english_output2.txt', 'r'):
        record = line.split()
        if record[0] == "[CLS]":
            i += 1
        word2idx[(record[0], i)] = j
        j += 1
        weights.append(record[1:])

def get_tokenizer(options):
    tokenizer = tokenization.FullTokenizer(vocab_file=options.vocab_file,
                                           do_lower_case=options.do_lower_case)
    return tokenizer

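# Usage sketch: FullTokenizer exposes tokenize() and convert_tokens_to_ids(),
# which is all the downstream feature conversion needs. The vocab path below
# is illustrative.
from argparse import Namespace

options = Namespace(vocab_file='vocab.txt', do_lower_case=True)
tokenizer = get_tokenizer(options)
tokens = ['[CLS]'] + tokenizer.tokenize('A short example.') + ['[SEP]']
ids = tokenizer.convert_tokens_to_ids(tokens)
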
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
import numpy as np
import pandas as pd

np.set_printoptions(suppress=True)

PATH = '../input/google-quest-challenge/'
BERT_PATH = '../input/bert-base-from-tfhub/bert_en_uncased_L-12_H-768_A-12'
tokenizer = tokenization.FullTokenizer(BERT_PATH + '/assets/vocab.txt', True)
MAX_SEQUENCE_LENGTH = 512

df_train = pd.read_csv(PATH + 'train.csv')
df_test = pd.read_csv(PATH + 'test.csv')
df_sub = pd.read_csv(PATH + 'sample_submission.csv')
print('train shape =', df_train.shape)
print('test shape =', df_test.shape)

output_categories = list(df_train.columns[11:])
input_categories = list(df_train.columns[[1, 2, 5]])
print('\noutput categories:\n\t', output_categories)
print('\ninput categories:\n\t', input_categories)


def _get_masks(tokens, max_seq_length):

import numpy as np
import bert_tokenization
from bert_serving.client import BertClient
from sentence_encoder import SentenceEncoder

BERT_BASE_DIR = 'external/bert/cased_L-24_H-1024_A-16/'
tokenizer = bert_tokenization.FullTokenizer(vocab_file=BERT_BASE_DIR + 'vocab.txt',
                                            do_lower_case=False)


def bert_embed(bc, sents, merge_subtokens=True, merge_strategy='first'):
    sents_encodings_full = bc.encode(sents)
    sents_tokenized = [tokenizer.tokenize(s) for s in sents]
    sents_encodings = []
    for sent_tokens, sent_vecs in zip(sents_tokenized, sents_encodings_full):
        sent_encodings = []
        sent_vecs = sent_vecs[1:-1]  # ignoring [CLS] and [SEP]
        for token, vec in zip(sent_tokens, sent_vecs):
            layers_vecs = np.split(vec, 4)  # due to -pooling_layer -4 -3 -2 -1
            layers_sum = np.array(layers_vecs, dtype=np.float32).sum(axis=0)
            sent_encodings.append((token, layers_sum))
        sents_encodings.append(sent_encodings)
    if merge_subtokens:
        sents_encodings_merged = []
        for sent, sent_encodings in zip(sents, sents_encodings):
            sent_tokens_vecs = []

flags.DEFINE_string(
    "predict_file",
    "/kaggle/input/tensorflow2-question-answering/simplified-nq-test.jsonl",
    "NQ json for predictions. E.g., dev-v1.1.jsonl.gz or test-v1.1.jsonl.gz")
flags.DEFINE_boolean("logtostderr", True, "Logs to stderr")
flags.DEFINE_boolean("undefok", True, "it's okay to be undefined")
flags.DEFINE_string('f', '', 'kernel')
flags.DEFINE_string('HistoryManager.hist_file', '', 'kernel')

FLAGS = flags.FLAGS

bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
tf2baseline.validate_flags_or_throw(bert_config)
tf.io.gfile.makedirs(FLAGS.output_dir)

tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                       do_lower_case=FLAGS.do_lower_case)

tpu_cluster_resolver = None
if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=FLAGS.master,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,

def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
    bert_config = bert_modeling.BertConfig.from_json_file(
        FLAGS.bert_config_file)
    tokenizer = bert_tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))
    examples = read_examples(FLAGS.input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=FLAGS.max_seq_length,
                                            tokenizer=tokenizer)
    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature
    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)
    input_fn = input_fn_builder(features=features,
                                seq_length=FLAGS.max_seq_length)
    with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
                                                 "w")) as writer:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            unique_id = int(result["unique_id"])
            feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            all_features = []
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [
                        round(float(x), 6)
                        for x in layer_output[i:(i + 1)].flat
                    ]
                    all_layers.append(layers)
                features = collections.OrderedDict()
                features["token"] = token
                features["layers"] = all_layers
                all_features.append(features)
            output_json["features"] = all_features
            writer.write(json.dumps(output_json) + "\n")

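# A minimal TF1-style entry point, assuming the script is driven by tf.flags
# as above (a sketch; the original may also mark required flags first):
if __name__ == "__main__":
    tf.app.run()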