def __init__(self, sentence_tokenizer='normal', raw_reviews_path=None):
    """ Sentence Tokenizers: normal, punkt """
    self.sent_tokenize = sent_tokenize
    if sentence_tokenizer == 'normal':
        pass
    elif sentence_tokenizer == 'punkt':
        self.sent_tokenize = nltk.data.load(
            'tokenizers/punkt/english.pickle').tokenize
    self.tokenizer = utils.get_tokenizer()
    if utils.is_none(raw_reviews_path):
        self.raw_reviews = utils.get_raw_test_reviews(review='tizi')
    else:
        with open(raw_reviews_path, 'r') as fi:
            self.raw_reviews = [line.rstrip() for line in fi]
    self.data = [[] for _ in range(6)]
    self.categories = ['food', 'service', 'price', 'place']
    self.conjunctions = [
        "tetapi sayangnya", "namun", "tetapi", "walaupun", "akan tetapi",
        "sayangnya", "hanya sayang", "sayang", "meski", "walau", "but"
    ]
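# Usage sketch (hedged): the enclosing class is not shown above, so
# `ReviewPreprocessor` is a hypothetical stand-in for its name.
# preprocessor = ReviewPreprocessor(sentence_tokenizer='punkt',
#                                   raw_reviews_path='reviews.txt')
# for review in preprocessor.raw_reviews:
#     sentences = preprocessor.sent_tokenize(review)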
def main():
    tokenizer = get_tokenizer(args.bert_vocab_path)
    train_data, dev_data, test_data, id2rel, rel2id, num_rels = load_data(
        args.train_path, args.dev_path, args.test_path, args.rel_dict_path)
    subject_model, object_model, hbt_model = E2EModel(
        args.bert_config_path, args.bert_checkpoint_path, args.LR, num_rels)

    # tensorflow: let GPU memory grow instead of grabbing it all at once
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        from keras.backend.tensorflow_backend import set_session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        set_session(sess)  # register the session with Keras (was imported but never called)

    STEPS = len(train_data) // args.BATCH_SIZE
    data_manager = data_generator(train_data, tokenizer, rel2id, num_rels,
                                  args.MAX_LEN, args.BATCH_SIZE)
    evaluator = Evaluate(subject_model, object_model, tokenizer, id2rel,
                         dev_data, args.save_weights_path, args.save_model_path)
    hbt_model.fit_generator(data_manager.__iter__(),
                            steps_per_epoch=STEPS,
                            epochs=args.EPOCH,
                            callbacks=[evaluator])
    print("model training finished")
def model_predict():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if K.backend() == 'tensorflow':
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
    tokenizer = get_tokenizer(args.bert_vocab_path)

    # read data and relations
    test_data, id2rel, rel2id, num_rels = load_data(args.test_path,
                                                    args.rel_dict_path)
    # load model
    subject_model, object_model, hbt_model = E2EModel(
        args.bert_config_path, args.bert_checkpoint_path, args.LR, num_rels)
    hbt_model.load_weights(args.save_weights_path)

    isExactMatch = args.dataset == 'Wiki-KBP'
    if isExactMatch:
        print("Exact Match")
    else:
        print("Partial Match")
    precision, recall, f1_score = metric(subject_model, object_model,
                                         test_data, id2rel, tokenizer,
                                         isExactMatch, args.test_result_path)
    print(f'{precision}\t{recall}\t{f1_score}')
def predict():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if K.backend() == 'tensorflow':
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
    tokenizer = get_tokenizer(args.bert_vocab_path)

    # read data and relations
    # test_data, id2rel, rel2id, num_rels = load_data(args.test_path, args.rel_dict_path)
    test_data = json.load(open(args.test_path, "r", encoding="utf-8"))
    id2rel, rel2id = json.load(open(args.rel_dict_path, "r", encoding="utf-8"))
    id2rel = {int(i): j for i, j in id2rel.items()}
    num_rels = len(id2rel)

    # load model
    subject_model, object_model, hbt_model = E2EModel(
        args.bert_config_path, args.bert_checkpoint_path, args.LR, num_rels)
    hbt_model.load_weights(args.save_weights_path)

    return_result = []
    for line in test_data:
        sent = line["text"]
        result = extract_items(subject_model, object_model, tokenizer, sent,
                               id2rel, h_bar=0.5, t_bar=0.5)
        return_result.append({"text": sent, "relation": result})

    with open("./results/baidurelation2020/test_data_pred.json", "w",
              encoding="utf-8") as f:
        f.write(json.dumps(return_result, ensure_ascii=False, indent=2))
def load_pairs_text(filename, lowercase, language='en'):
    """
    Read a TSV file with lines in the format: sent1 \t sent2 \t label

    :param filename: path to the file
    :param lowercase: whether to convert content to lower case
    :param language: language to use tokenizer (only used if input is in TSV format)
    :return: a list of tuples (first_sent, second_sent, label)
    """
    logging.info('Reading data from %s' % filename)
    # we are only interested in the actual sentences + gold label;
    # the corpus files have a few more fields
    useful_data = []
    with codecs.open(filename, 'r', 'utf-8') as f:
        tokenize = utils.get_tokenizer(language)
        for line in f:
            line = line.strip()
            if lowercase:
                line = line.lower()
            sent1, sent2, label = line.split('\t')
            if label == '-':
                # ignore items without a gold label
                continue
            tokens1 = ['_BOS_'] + tokenize(sent1)
            tokens2 = ['_BOS_'] + tokenize(sent2)
            useful_data.append((tokens1, tokens2, label))
    return useful_data
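# Usage sketch (hedged): assumes a tab-separated file with lines such as
#   "A man is eating.\tSomeone is having a meal.\tentailment"
# pairs = load_pairs_text('snli_dev.tsv', lowercase=True, language='en')
# tokens1, tokens2, label = pairs[0]  # each token list starts with '_BOS_'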
def model_predict():
    max_len = 128
    args = model_params()
    test_data, test_label, _, _ = load_data(args["test_file"])
    print("test data size: ", len(test_data))
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    test_x, len_list = create_infer_inputs(test_data, max_len, tokenizer)
    print("test data tokenized: ", test_x[:3])
    tag2id = {
        'O': 0,
        'B-LOC': 1,
        'I-LOC': 2,
        'B-PER': 3,
        'I-PER': 4,
        'B-ORG': 5,
        'I-ORG': 6
    }
    model = create_model(args["pretrain_model_path"], len(tag2id), args["dropout"])
    model.load_weights("./output/ner_model.h5")
    pred_logits = model.predict(test_x)
    id2tag = {value: key for key, value in tag2id.items()}
    # pred_logits shape: [batch_size, seq_len, num_tags]; argmax over the tag axis
    pred = np.argmax(pred_logits, axis=2).tolist()
    predict_label = []
    for i in range(len(len_list)):
        temp = []
        temp_pred = pred[i]
        # keep only the tags for real tokens (up to the original sentence length)
        for j in range(min(len_list[i], max_len)):
            temp.append(id2tag[temp_pred[j]])
        predict_label.append(temp)
    print("predicted labels: ", predict_label)
def read_corpus(filename, lowercase, language='en', ratio=None):
    """
    Read a JSONL or TSV file with the SNLI corpus

    :param filename: path to the file
    :param lowercase: whether to convert content to lower case
    :param language: language to use tokenizer (only used if input is in TSV format)
    :param ratio: if given, randomly keep only a subsample of the data
    :return: a tuple (list of (first_sent, second_sent, label), max sentence length)
    """
    # we are only interested in the actual sentences + gold label;
    # the corpus files have a few more fields
    useful_data = []
    max_len = 0
    with open(filename, 'rb') as f:
        if filename.endswith('.tsv') or filename.endswith('.txt'):
            tokenize = utils.get_tokenizer(language)
            for line in f:
                line = line.decode('utf-8').strip()
                if lowercase:
                    line = line.lower()
                sent1, sent2, label = line.split('\t')
                if label == '-':
                    continue
                tokens1 = tokenize(sent1)
                tokens2 = tokenize(sent2)
                # keep the pair only if it survives the random subsampling
                # (np.float was removed from NumPy; plain float works the same)
                if not ratio or np.random.random() > float(ratio):
                    max_len = max([len(tokens1), len(tokens2), max_len])
                    useful_data.append((tokens1, tokens2, label))
        else:
            # the SNLI corpus has one JSON object per line
            for line in f:
                line = line.decode('utf-8')
                if lowercase:
                    line = line.lower()
                data = json.loads(line)
                if data['gold_label'] == '-':
                    # ignore items without a gold label
                    continue
                tree1 = nltk.Tree.fromstring(data['sentence1_parse'])
                tree2 = nltk.Tree.fromstring(data['sentence2_parse'])
                tokens1 = tree1.leaves()
                tokens2 = tree2.leaves()
                label = data['gold_label']
                max_len = max([len(tokens1), len(tokens2), max_len])
                useful_data.append((tokens1, tokens2, label))
    return useful_data, max_len
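# Usage sketch (hedged): note the `>` comparison above keeps a pair with
# probability 1 - ratio, so ratio=0.7 retains roughly 30% of the corpus.
# train_pairs, max_len = read_corpus('snli_1.0_train.jsonl', lowercase=True)
# sampled_pairs, _ = read_corpus('snli_1.0_train.txt', lowercase=True, ratio=0.7)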
def __init__(self, data: pd.DataFrame, augment: bool = False):
    self._augment = augment
    self._tokenizer = get_tokenizer('bert')
    self._sentiment_ids = {'positive': 3893, 'negative': 4997, 'neutral': 8699}
    self._data_df = data
    self.exception_count = 0
    self.exceptions = []
    self.exception_mask = []
def __init__(self, data: pd.DataFrame, augment: bool = False):
    self._augment = augment
    self._tokenizer = get_tokenizer('xlnet')
    self._sentiment_ids = {'positive': 1654, 'negative': 2981, 'neutral': 9201}
    self._data_df = data
    self.exception_count = 0
    self.exceptions = []
    self.exception_mask = []
def __init__(self, data: pd.DataFrame, augment: bool = False):
    self._augment = augment
    self._tokenizer = get_tokenizer('roberta')
    self._sentiment_ids = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
    self._data_df = data
    self.exception_count = 0
    self.exceptions = []
    self.exception_mask = []
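# Hedged sanity-check sketch: the hard-coded ids in the three constructors
# above are the vocabulary ids of "positive"/"negative"/"neutral" in each
# pretrained vocabulary. Assuming get_tokenizer returns a `tokenizers`-library
# tokenizer whose encode() yields an object with an `.ids` attribute (as used
# in predict_test below), they can be re-derived like this:
# tok = get_tokenizer('roberta')
# print({w: tok.encode(' ' + w).ids for w in ('positive', 'negative', 'neutral')})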
def main():
    args = params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])
    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    # fixed: the original tested os.path.join instead of os.path.exists,
    # so the pb directory was never created
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    # categories: sports, health, military, education, automobile
    tag2id = {"体育": 0, "健康": 1, "军事": 2, "教育": 3, "汽车": 4}
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]

    # load data
    data, label = load_data(args["data_file"], tag2id)
    logger.info("total data size: {}".format(len(data)))
    logger.info("total label size: {}".format(len(label)))
    # shuffle the data randomly
    data, label = random_shuffle(data, label)
    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    total_label = label_encoder(label, len(tag2id))

    # split into train and dev sets
    train_data, dev_data, train_label, dev_label = train_test_split(
        data, total_label, test_size=0.2)
    logger.info("train data size: {}".format(len(train_data)))
    logger.info("dev data size: {}".format(len(dev_data)))

    # bert tokenizer
    tokenizer = get_tokenizer()
    # tokenizer = get_roberta_tokenizer()

    # prepare model inputs
    train_x, train_y = create_inputs_targets(train_data, train_label, max_len, tokenizer)
    dev_x, dev_y = create_inputs_targets(dev_data, dev_label, max_len, tokenizer)

    # create bert model
    # model = create_model(len(tag2id))
    model = create_model(args["bert_model_name"], len(tag2id))
    # model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)

    # save model weights
    model_path = os.path.join(args["output_path"], "classification_model.h5")
    model.save_weights(model_path, overwrite=True)
    # save as a TF SavedModel (pb)
    tf.keras.models.save_model(model, args["pb_path"], save_format="tf", overwrite=True)
def main():
    args = get_args()
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    df_train = pd.read_csv(args["train_file"])
    train_datas = df_train['comment_text'].tolist()
    train_labels = df_train[label_cols].values.tolist()
    print("train data size: ", len(train_datas))
    print("train label size: ", len(train_labels))
    train_data, val_data, train_label, val_label = train_test_split(
        train_datas, train_labels, test_size=0.2, random_state=0)
    tokenizer = get_tokenizer(args["bert_model_name"], args["pretrain_model_path"])
    train_x, train_y = get_model_data(train_data, train_label, tokenizer, args["max_length"])
    val_x, val_y = get_model_data(val_data, val_label, tokenizer, args["max_length"])
    model = create_model(args["bert_model_name"], len(label_cols))

    # custom F1-score callback
    # metrics = Metrics(val_x, val_y)
    # callbacks = [metrics]

    # save the best model as a TF SavedModel (pb)
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            # Overwrite the current checkpoint if and only if
            # the monitored score has improved.
            filepath="./output/model/1",  # {epoch}
            save_best_only=True,
            # note: with an AUC metric you likely want monitor='val_auc' and
            # mode='max'; 'auto' mode only infers max for acc/fmeasure names
            monitor='auc',  # 'accuracy'
            verbose=1,
        )
    ]
    model.fit(train_x,
              train_y,
              epochs=args["epoch"],
              verbose=1,
              batch_size=args["batch_size"],
              callbacks=callbacks,
              validation_data=(val_x, val_y),
              validation_batch_size=args["batch_size"])

    if not os.path.exists(args["model_path"]):
        os.makedirs(args["model_path"])
    model.save_weights(args["model_path"])
    if not os.path.exists(args["pbmodel_path"]):
        os.makedirs(args["pbmodel_path"])
    tf.keras.models.save_model(model, args["pbmodel_path"], save_format="tf")
def __init__(self, df: pd.DataFrame):
    self._data_df = df
    self._tokenizer = get_tokenizer('roberta')
    self._sentiment_ids = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
    n_data = self._data_df.shape[0]
    # input_ids initialised to 1 (RoBERTa's <pad> id); masks and targets to 0
    self._input_ids = np.ones((n_data, Config.Train.max_len), dtype='int32')
    self._attention_mask = np.zeros((n_data, Config.Train.max_len), dtype='int32')
    self._token_type_ids = np.zeros((n_data, Config.Train.max_len), dtype='int32')
    self._start_tokens = np.zeros((n_data, Config.Train.max_len), dtype='int32')
    self._end_tokens = np.zeros((n_data, Config.Train.max_len), dtype='int32')
def __init__(self, sentence, word_dict, lowercase, language='en'):
    self.sentence = sentence
    tokenize = utils.get_tokenizer(language)
    if lowercase:
        pre_tokenize = sentence.lower()
    else:
        pre_tokenize = sentence
    self.tokens = tokenize(pre_tokenize)
    # fixed: tokens_with_null was referenced but never defined; prepend the
    # padding/null symbol here (assumption about the original definition)
    self.tokens_with_null = [utils.PADDING] + self.tokens
    self.indices = [word_dict[token] for token in self.tokens_with_null]
    self.padding_index = word_dict[utils.PADDING]
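# Usage sketch (hedged): the enclosing class is not shown, so `Sentence` is a
# stand-in; word_dict maps tokens (and utils.PADDING) to vocabulary indices.
# sent = Sentence('The cat sat.', word_dict, lowercase=True)
# print(sent.tokens, sent.indices)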
def __init__(self, ckpt_path, max_seq_length=128, batch_size=32):
    print('load gpt2 scorer from', ckpt_path)
    ckpt_dir = os.path.dirname(ckpt_path)
    self.tokenizer = get_tokenizer('gpt2', ckpt_dir)
    self.criterion = torch.nn.CrossEntropyLoss(reduction='none',
                                               ignore_index=self.tokenizer.pad_token_id)
    self.criterion.cuda()
    self.model = torch.load(ckpt_path)
    self.model.eval()
    self.model.cuda()
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
def __init__(self, ckpt_path, max_seq_length=128, batch_size=32):
    print('load bert proposal from', ckpt_path)
    # config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
    ckpt_dir = os.path.dirname(ckpt_path)
    # config = config_class.from_pretrained(ckpt_dir)
    self.tokenizer = get_tokenizer('bert', ckpt_dir)
    self.model = torch.load(ckpt_path)
    self.model.eval()
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.model.cuda()
def create_keras_sequences(data_path='../text/cleaned/'):
    """Load and wrangle data, then build train/validation Keras sequences."""
    logging.info('Loading and wrangling data.')
    lines = get_lines(data_path)
    tokenizer = get_tokenizer(lines)
    all_words_list = ordereddict_to_list(tokenizer.word_counts)
    p = p_distribution(lines)
    index = int(len(lines) * VAL_RATIO)
    return (TextSequence(lines[:-index], tokenizer, all_words_list, BATCH_SIZE, 8, 20, p),
            TextSequence(lines[-index:], tokenizer, all_words_list, BATCH_SIZE, 8, 20, p),
            len(tokenizer.word_index) + 1)
def predict_test():
    print('\n>> Predicting on test')
    max_l = Config.Train.max_len
    test_df = pd.read_csv(Config.test_path)
    _test_generator = RobertaTestDataGenerator(test_df)
    test_dataset = tf.data.Dataset.from_generator(
        _test_generator.generate,
        output_types=({'ids': tf.int32, 'att': tf.int32, 'tti': tf.int32}))
    test_dataset = test_dataset.padded_batch(
        Config.Train.batch_size,
        padded_shapes=({'ids': [max_l], 'att': [max_l], 'tti': [max_l]}),
        padding_values=({'ids': 1, 'att': 0, 'tti': 0}))
    test_dataset = test_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    model_dir = Config.Train.checkpoint_dir / Config.model_type
    start_idx = 0
    end_idx = 0
    model_count = len(list(model_dir.iterdir()))
    # average the start/end logits over all saved checkpoints
    for i in range(model_count):
        model_path = model_dir / f'weights_{Config.version}_{i}.h5'
        model = get_roberta()
        model.load_weights(str(model_path))
        # fixed: accumulate each fold's predictions instead of overwriting
        # them (the original did `start_idx, end_idx = model.predict(...)`
        # followed by `start_idx += start_idx`, which just doubled one fold)
        preds_start, preds_end = model.predict(test_dataset, verbose=1)
        start_idx += preds_start
        end_idx += preds_end
    start_idx /= model_count
    end_idx /= model_count
    start_idx = np.argmax(start_idx, axis=-1)
    end_idx = np.argmax(end_idx, axis=-1)
    # guard against a predicted end before the predicted start
    end_idx = np.where(start_idx > end_idx, start_idx, end_idx)

    tokenizer = get_tokenizer('roberta')
    selected_texts = []
    for i, row in enumerate(test_df.itertuples(index=False, name='tweet')):
        a = start_idx[i]
        b = end_idx[i]
        text = ' ' + ' '.join(row.text.split())
        encoded_text = tokenizer.encode(text)
        selected_text = tokenizer.decode(encoded_text.ids[a - 1:b])
        selected_texts.append(selected_text)

    test_df['selected_text'] = selected_texts
    test_df.to_csv('test_predictions.csv', index=False)
    test_df[['textID', 'selected_text']].to_csv('submission.csv', index=False)
def main(test_data, args, label_num):
    # test_steps_per_epoch = len(test_data) // args["batch_size"]
    tokenizer = get_tokenizer(args['bert_model_name'], args['pretrain_model_path'])
    testdata = get_model_data(test_data, tokenizer, args["max_length"])
    print("testdata: ", testdata)
    model = create_model(args['bert_model_name'], label_num)
    model.load_weights("./output/model/mulclassifition.h5")
    pred_logits = model.predict(testdata, batch_size=args["batch_size"])
    # binarise the multi-label probabilities at 0.5
    pred = np.where(pred_logits >= 0.5, 1, 0).tolist()
    # pred = np.where(pred < 0.5, pred, 1).tolist()
    return pred
def text_classifier_predict(sentences, max_len, tag2id, bert_model_name, model_path):
    # get tokenizer
    tokenizer = get_tokenizer()
    test_x = create_infer_inputs(sentences, max_len, tokenizer)
    # id2tag
    id2tag = {value: key for key, value in tag2id.items()}
    # model
    model = create_model(bert_model_name, len(tag2id))
    model.load_weights(model_path)
    logits = model.predict(test_x)
    pred = np.argmax(logits, axis=1).tolist()
    pred_label = [id2tag[i] for i in pred]
    print("predicted labels: ", pred_label)
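# Usage sketch (hedged): tag2id and the weights path mirror the training
# script above; the sample sentence ("Messi scores a hat-trick") is
# illustrative only.
# tag2id = {"体育": 0, "健康": 1, "军事": 2, "教育": 3, "汽车": 4}
# text_classifier_predict(["梅西上演帽子戏法"], max_len=128, tag2id=tag2id,
#                         bert_model_name="bert-base-chinese",
#                         model_path="./output/classification_model.h5")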
def main():
    args = get_args()
    df_test = pd.read_csv(args["test_file"])
    test_data = df_test['comment_text'].values.tolist()
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    tokenizer = get_tokenizer(args['bert_model_name'], args['pretrain_model_path'])
    testdata = get_model_data(test_data, tokenizer, args["max_length"])
    model = create_model(args["bert_model_name"], len(label_cols))
    model.load_weights(args["model_path"])
    pred_logits = model.predict(testdata)
    pred = np.where(pred_logits > 0.15, 1, 0).tolist()
    print(pred)
def __init__(self, verbose=False, rnn_type="gru"):
    self.max_nr_utterances = config.data["max_nr_utterances"]
    self.max_nr_words = config.data["max_nr_words"]
    self.corpus = config.corpus["corpus"]
    self.detail_level = config.corpus["detail_level"]
    self.verbose = verbose

    self.id2tag = get_id2tag(self.corpus, detail_level=self.detail_level)
    self.tag2id = {t: id for id, t in self.id2tag.items()}
    self.tag2full = get_tag2full_label(self.corpus, self.detail_level)
    self.n_tags = len(self.tag2id.keys())

    self.tokenizer = get_tokenizer(rebuild_from_all_words=False)
    word2id = self.tokenizer.word_index

    # WARNING: if you force a rebuild, the embedding matrix may change and you
    # may need to retrain the neural network! Set force_rebuild to False when
    # not changing the total vocabulary.
    self.embedding_matrix = get_embedding_matrix(word2id, force_rebuild=False)

    # use GPU
    os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

    factory = BiRNN_CRF_factory(self.embedding_matrix, self.n_tags, rnn_type)
    self.model = factory.get()

    data_name = self.corpus + "_detail_" + str(self.detail_level)
    checkpoint_path = "../trained_model/bilstm_crf/ckpt_" + data_name + ".hdf5"
    if os.path.exists(checkpoint_path):
        if self.verbose:
            print("loading trained weights...")
        self.model.load_weights(checkpoint_path)
        if self.verbose:
            print("Done!")
    else:
        print("WARNING: no model found in path, using untrained model!")
def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
    if vocab is None or tokenizer is None:
        tok_path = get_tokenizer()
        self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
        self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
        _, vocab_of_gluonnlp = get_kobert_model()
        token2idx = vocab_of_gluonnlp.token_to_idx
        self.vocab = Vocabulary(token2idx=token2idx)
        self.tokenizer = Tokenizer(vocab=self.vocab,
                                   split_fn=self.ptr_tokenizer,
                                   pad_fn=keras_pad_fn,
                                   maxlen=maxlen)
    else:
        self.vocab = vocab
        self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.model_dir = model_dir
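# Usage sketch (hedged): the enclosing class name is not shown, so
# `Preprocessor` is a stand-in. With no vocab/tokenizer supplied, the
# pretrained KoBERT sentencepiece model and vocabulary are loaded.
# prep = Preprocessor(maxlen=30)
# print(prep.ptr_tokenizer('안녕하세요'))  # sentencepiece subword tokens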
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        assert False, "--dataset must be one of ['squadv1', 'squadv2', 'debug']"

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        print("Starting finetuning")
        pbar = tqdm.tqdm(total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the
    # second pass of run_squad_and_get_results().
    # Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model, data_dir=data_dir, filename=val_filename, batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}"
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step
                    )

        if is_final_step:
            break

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
        return results
    )
    g.set_title(title, fontsize=16)
    g.set_xlabel("Predicted Label", fontsize=14)
    g.set_ylabel("True Label", fontsize=14)
    plt.savefig(save_path, bbox_inches="tight")
    plt.show()


conversations, labels = load_corpus_data(corpus, detail_level)
conversations = chunk(conversations, max_nr_utterances)
labels = chunk(labels, max_nr_utterances)

n_tags = len(get_id2tag(corpus, detail_level=detail_level))
tokenizer = get_tokenizer(rebuild_from_all_words=False)
word2id = tokenizer.word_index

X, y = make_model_readable_data(conversations, labels, tokenizer,
                                max_nr_utterances, max_nr_words)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)

# import pretrained embeddings
# set force_rebuild to False when not changing the total vocabulary
embedding_matrix = get_embedding_matrix(word2id, force_rebuild=False)

# model = get_bilstm_crf_model(embedding_matrix, n_tags)
    vocab_path = '../kogpt2/kogpt2_news_wiki_ko_cased_818bfa919d.spiece'
    return get_kogpt2_model(model_path, vocab_path, ctx)


def load_kogpt2_model_from_checkpoint(kogpt2, load_path, device, ctx='cpu'):
    try:
        checkpoint = torch.load(load_path, map_location=device)
        kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])
        kogpt2model.eval()
    except Exception:
        count = 0
        kogpt2model, _ = load_kogpt2_model()
    else:
        # recover the step count from the checkpoint file name
        count = int(re.findall(r"\d+", load_path)[1])
    print(count)
    return kogpt2model, count


if __name__ == "__main__":
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    PATH = '../model/pretrained_kogpt2.pth'
    torch.save(model.state_dict(), PATH)
import requests
import traceback
import numpy as np
from flask import request
from flask import Flask, jsonify
from tensorflow.keras.preprocessing.sequence import pad_sequences

from config import get_args
from utils import get_tokenizer

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

args = get_args()
tokenizer = get_tokenizer(args['bert_model_name'], args['pretrain_model_path'])


def get_model_data(sentence, tokenizer, max_seq_len=128):
    dataset_dict = {
        "input_ids": [],
        "attention_mask": [],
    }
    input_ids = tokenizer.encode(
        sentence,                 # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=max_seq_len,   # Truncate all sentences.
    )
    sentence_length = len(input_ids)
    input_ids = pad_sequences([input_ids],
    config = json.load(f)

model_config = config["model"]
pretraining_config = config["pretraining_setting"]
gpu_config = config["gpu_setting"]
checkpoint_dir = config["model_checkpoints"]
glue_dataset_folder = config["glue_dataset_folder"]

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Datasets ###########################

tokenizer = utils.get_tokenizer(model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

data_args = GlueDataTrainingArguments(
    task_name=args.task,
    data_dir=os.path.join(glue_dataset_folder, args.task),
    max_seq_length=model_config["max_seq_len"],
    overwrite_cache=True)
train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
data_loader = DataLoader(train_dataset,
                         batch_size=args.batch_size,
                         shuffle=True,
                         collate_fn=default_data_collator)
num_steps_per_epoch = len(data_loader)
print(f"num_steps_per_epoch: {num_steps_per_epoch}", flush=True)
@classmethod
def get_true_length(cls, examples):
    assert cls.tokenizer is not None
    print(f'Tokenizer_type: {cls.tokenizer.name_or_path}, should check the n_real method.')
    # n: number of non-padding tokens, minus the two special tokens ([CLS], [SEP])
    examples['n'] = [sum(i) - 2 for i in examples['attention_mask']]
    # n_real: whole-word count; '##'-prefixed WordPiece continuations are not
    # counted (note: padding tokens, if present, are also counted here)
    examples['n_real'] = [
        sum([0 if cls.tokenizer.convert_ids_to_tokens(i).startswith('##') else 1
             for i in line]) - 2
        for line in examples['input_ids']
    ]
    return examples


if __name__ == '__main__':
    from utils import get_tokenizer
    from copy import deepcopy

    t = get_tokenizer('bert-base-chinese', is_zh=True)
    ds = get_tokenized_ds('hfds_scripts/atec_dataset.py',
                          '../sentence-embedding/data/ATEC/atec_nlp_sim_train.csv',
                          t, tokenize_type='with_prefix')
    ds = ds['atec']
    ds2 = deepcopy(ds)
    for index, ds_ in enumerate([ds, ds2]):
        features = list(ds_.features)
        for feature in features:
            if index:
                if feature.startswith('textb') or feature == 'label':
                    ds_.remove_columns_(feature)
                else:
                    ds_.rename_column_(feature, feature[6:])
            else:
                if feature.startswith('texta') or feature == 'label':
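# Worked example (hedged) for the '##' counting in get_true_length above:
# for the unpadded WordPiece sequence
#   ['[CLS]', 'un', '##believ', '##able', '[SEP]']
# n counts 5 - 2 = 3 subword tokens, while n_real skips the two '##'
# continuations and counts 3 - 2 = 1 whole word.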
data_folder = os.path.join(curr_path, "datasets", config["data_folder"])

if args.batch_size is not None:
    pretraining_config["batch_size"] = args.batch_size
if args.num_batch is not None:
    pretraining_config["validate_batches_per_epoch"] = args.num_batch

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Dataset ###########################

tokenizer = utils.get_tokenizer(os.path.join(curr_path, 'roberta-base'),
                                model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

if "dataset" not in config:
    config["dataset"] = None
dataset = CorpusDataset(folder_path=data_folder, file_json="dev.json",
                        option=config["dataset"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True, mlm_probability=0.15)
data_loader = DataLoader(dataset,
                         batch_size=pretraining_config["batch_size"],
                         collate_fn=data_collator)
pretrain_dataloader_iter = enumerate(data_loader)