Example #1
    def __init__(self, sentence_tokenizer='normal', raw_reviews_path=None):
        """
            Sentence Tokenizers: normal, punkt
        """
        self.sent_tokenize = sent_tokenize
        if sentence_tokenizer == 'normal':
            pass
        elif sentence_tokenizer == 'punkt':
            self.sent_tokenize = nltk.data.load(
                'tokenizers/punkt/english.pickle').tokenize

        self.tokenizer = utils.get_tokenizer()

        if utils.is_none(raw_reviews_path):
            self.raw_reviews = utils.get_raw_test_reviews(review='tizi')
        else:
            with open(raw_reviews_path, 'r') as fi:
                self.raw_reviews = [line.rstrip() for line in fi]

        self.data = []
        for _ in range(6):
            self.data.append([])

        self.categories = ['food', 'service', 'price', 'place']
        self.conjunctions = [
            "tetapi sayangnya", "namun", "tetapi", "walaupun", "akan tetapi",
            "sayangnya", "hanya sayang", "sayang", "meski", "walau", "but"
        ]
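For reference, a minimal usage sketch of what the two tokenizer branches above resolve to, assuming NLTK is installed and its punkt model has been downloaded (the enclosing class itself is not shown in the snippet):

import nltk
from nltk.tokenize import sent_tokenize

text = "First sentence. Second sentence."
# 'normal' keeps nltk.tokenize.sent_tokenize as-is.
print(sent_tokenize(text))
# 'punkt' loads the English punkt model explicitly and uses its tokenize method.
punkt_tokenize = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
print(punkt_tokenize(text))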
Example #2
def main():
    tokenizer = get_tokenizer(args.bert_vocab_path)
    train_data, dev_data, test_data, id2rel, rel2id, num_rels = load_data(
        args.train_path, args.dev_path, args.test_path, args.rel_dict_path)
    subject_model, object_model, hbt_model = E2EModel(
        args.bert_config_path, args.bert_checkpoint_path, args.LR, num_rels)
    # tensorflow
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if K.backend() == 'tensorflow':
        import tensorflow as tf
        from keras.backend.tensorflow_backend import set_session

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        set_session(sess)

    STEPS = len(train_data) // args.BATCH_SIZE
    data_manager = data_generator(train_data, tokenizer, rel2id, num_rels,
                                  args.MAX_LEN, args.BATCH_SIZE)
    evaluator = Evaluate(subject_model, object_model, tokenizer, id2rel,
                         dev_data, args.save_weights_path,
                         args.save_model_path)
    hbt_model.fit_generator(data_manager.__iter__(),
                            steps_per_epoch=STEPS,
                            epochs=args.EPOCH,
                            callbacks=[evaluator])
    print("model training finish")
Example #3
def model_predict():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if K.backend() == 'tensorflow':
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    tokenizer = get_tokenizer(args.bert_vocab_path)
    # read data and relation
    test_data, id2rel, rel2id, num_rels = load_data(args.test_path,
                                                    args.rel_dict_path)
    # load model
    subject_model, object_model, hbt_model = E2EModel(
        args.bert_config_path, args.bert_checkpoint_path, args.LR, num_rels)
    hbt_model.load_weights(args.save_weights_path)

    isExactMatch = args.dataset == 'Wiki-KBP'
    if isExactMatch:
        print("Exact Match")
    else:
        print("Partial Match")
    precision, recall, f1_score = metric(subject_model, object_model,
                                         test_data, id2rel, tokenizer,
                                         isExactMatch, args.test_result_path)
    print(f'{precision}\t{recall}\t{f1_score}')
Example #4
def predict():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if K.backend() == 'tensorflow':
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    tokenizer = get_tokenizer(args.bert_vocab_path)
    # read data and relation
    # test_data, id2rel, rel2id, num_rels = load_data(args.test_path, args.rel_dict_path)
    test_data = json.load(open(args.test_path, "r", encoding="utf-8"))
    id2rel, rel2id = json.load(open(args.rel_dict_path, "r", encoding="utf-8"))

    id2rel = {int(i): j for i, j in id2rel.items()}
    num_rels = len(id2rel)
    # load model
    subject_model, object_model, hbt_model = E2EModel(
        args.bert_config_path, args.bert_checkpoint_path, args.LR, num_rels)
    hbt_model.load_weights(args.save_weights_path)
    return_result = []
    for line in test_data:
        sent = line["text"]
        result = extract_items(subject_model,
                               object_model,
                               tokenizer,
                               sent,
                               id2rel,
                               h_bar=0.5,
                               t_bar=0.5)
        return_result.append({"text": sent, "relation": result})

    with open("./results/baidurelation2020/test_data_pred.json",
              "w",
              encoding="utf-8") as f:
        f.write(json.dumps(return_result, ensure_ascii=False, indent=2))
Example #5
def load_pairs_text(filename, lowercase, language='en'):
    """
    Read sent1 \t sent2 \t label

    :param filename: path to the file
    :param lowercase: whether to convert content to lower case
    :param language: language used to select the tokenizer
    :return: a list of tuples (first_sent, second_sent, label)
    """
    logging.info('Reading data from %s' % filename)
    # we are only interested in the actual sentences + the gold label;
    # the corpus files have a few more fields
    useful_data = []
    # one tab-separated example per line
    with codecs.open(filename, 'r', 'utf-8') as f:
        tokenize = utils.get_tokenizer(language)
        for line in f:
            line = line.strip()
            if lowercase:
                line = line.lower()
            sent1, sent2, label = line.split('\t')
            if label == '-':
                continue
            tokens1 = tokenize(sent1)
            tokens2 = tokenize(sent2)
            tokens1 = ['_BOS_'] + tokens1
            tokens2 = ['_BOS_'] + tokens2
            useful_data.append((tokens1, tokens2, label))
    return useful_data
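A hypothetical call to load_pairs_text, assuming a tab-separated file with one sent1<TAB>sent2<TAB>label triple per line (the path below is made up for illustration):

# Hypothetical path; each returned item is (tokens1, tokens2, label),
# with a '_BOS_' marker prepended to both token lists.
pairs = load_pairs_text('data/dev_pairs.tsv', lowercase=True, language='en')
for tokens1, tokens2, label in pairs[:3]:
    print(label, tokens1[:5], tokens2[:5])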
Example #6
def model_predict():
    max_len = 128
    args = model_params()
    test_data, test_label, _, _ = load_data(args["test_file"])
    print("test data size: ", len(test_data))
    tokenizer = get_tokenizer(args["pretrain_model_path"])
    test_x, len_list = create_infer_inputs(test_data, max_len, tokenizer)
    print("test data tokenizer: ", test_x[:3])
    tag2id = {
        'O': 0,
        'B-LOC': 1,
        'I-LOC': 2,
        'B-PER': 3,
        'I-PER': 4,
        'B-ORG': 5,
        'I-ORG': 6
    }
    model = create_model(args["pretrain_model_path"], len(tag2id),
                         args["dropout"])
    model.load_weights("./output/ner_model.h5")
    pred_logits = model.predict(test_x)
    id2tag = {value: key for key, value in tag2id.items()}
    # shape [batch_size, seq_len]
    pred = np.argmax(pred_logits, axis=2).tolist()
    predict_label = []
    for i in range(len(len_list)):
        temp = []
        temp_pred = pred[i]
        for j in range(min(len_list[i], max_len)):
            temp.append(id2tag[temp_pred[j]])
        predict_label.append(temp)
    print("predict label: ", predict_label)
Example #7
def read_corpus(filename, lowercase, language='en', ratio=None):
    """
    Read a JSONL or TSV file with the SNLI corpus

    :param filename: path to the file
    :param lowercase: whether to convert content to lower case
    :param language: language to use tokenizer (only used if input is in
        TSV format)
    :param ratio: if given, randomly keep each TSV example with
        probability 1 - ratio
    :return: a tuple (examples, max_len), where examples is a list of
        tuples (first_sent, second_sent, label)
    """
    # we are only interested in the actual sentences + gold label
    # the corpus files have a few more things
    useful_data = []
    max_len = 0
    # the SNLI corpus has one JSON object per line
    with open(filename, 'rb') as f:

        if filename.endswith('.tsv') or filename.endswith('.txt'):

            tokenize = utils.get_tokenizer(language)
            for line in f:
                line = line.decode('utf-8').strip()
                if lowercase:
                    line = line.lower()
                sent1, sent2, label = line.split('\t')
                if label == '-':
                    continue
                tokens1 = tokenize(sent1)
                tokens2 = tokenize(sent2)
                if ratio:
                    if np.random.random() > float(ratio):
                        max_len = max([len(tokens1), len(tokens2), max_len])
                        useful_data.append((tokens1, tokens2, label))
                else:
                    max_len = max([len(tokens1), len(tokens2), max_len])
                    useful_data.append((tokens1, tokens2, label))
        else:
            for line in f:
                line = line.decode('utf-8')
                if lowercase:
                    line = line.lower()
                data = json.loads(line)
                if data['gold_label'] == '-':
                    # ignore items without a gold label
                    continue

                sentence1_parse = data['sentence1_parse']
                sentence2_parse = data['sentence2_parse']
                label = data['gold_label']

                tree1 = nltk.Tree.fromstring(sentence1_parse)
                tree2 = nltk.Tree.fromstring(sentence2_parse)
                tokens1 = tree1.leaves()
                tokens2 = tree2.leaves()
                t = (tokens1, tokens2, label)
                max_len = max([len(tokens1), len(tokens2), max_len])
                useful_data.append(t)

    return useful_data, max_len
Example #8
 def __init__(self, data: pd.DataFrame, augment: bool = False):
     self._augment = augment
     self._tokenizer = get_tokenizer('bert')
     self._sentiment_ids = {'positive': 3893, 'negative': 4997, 'neutral': 8699}
     self._data_df = data
     self.exception_count = 0
     self.exceptions = []
     self.exception_mask = []
Example #9
 def __init__(self, data: pd.DataFrame, augment: bool = False):
     self._augment = augment
     self._tokenizer = get_tokenizer('xlnet')
     self._sentiment_ids = {'positive': 1654, 'negative': 2981, 'neutral': 9201}
     self._data_df = data
     self.exception_count = 0
     self.exceptions = []
     self.exception_mask = []
Example #10
 def __init__(self, data: pd.DataFrame, augment: bool = False):
     self._augment = augment
     self._tokenizer = get_tokenizer('roberta')
     self._sentiment_ids = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
     self._data_df = data
     self.exception_count = 0
     self.exceptions = []
     self.exception_mask = []
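Examples #8-#10 differ only in the tokenizer name and the vocabulary ids of the three sentiment words; a small sketch of a shared base class that factors this out (the class name and the idea of a base class are assumptions, the id values are copied from the snippets):

SENTIMENT_IDS = {
    'bert':    {'positive': 3893, 'negative': 4997, 'neutral': 8699},
    'xlnet':   {'positive': 1654, 'negative': 2981, 'neutral': 9201},
    'roberta': {'positive': 1313, 'negative': 2430, 'neutral': 7974},
}

class SentimentDatasetBase:
    """Hypothetical shared base for the three dataset wrappers above."""

    def __init__(self, data, model_name, augment=False):
        self._augment = augment
        self._tokenizer = get_tokenizer(model_name)
        self._sentiment_ids = SENTIMENT_IDS[model_name]
        self._data_df = data
        self.exception_count = 0
        self.exceptions = []
        self.exception_mask = []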
Example #11
def main():
    args = params()
    tag2id_path = os.path.join(args["output_path"], args["tag2id"])

    if not os.path.exists(args["output_path"]):
        os.makedirs(args["output_path"])
    if not os.path.exists(args["pb_path"]):
        os.makedirs(args["pb_path"])
    tag2id = {"体育": 0, "健康": 1, "军事": 2, "教育": 3, "汽车": 4}
    max_len = args["max_len"]
    batch_size = args["batch_size"]
    epoch = args["epoch"]
    # load data
    data, label = load_data(args["data_file"], tag2id)
    logger.info("total data size: {}".format(len(data)))
    logger.info("total label size: {}".format(len(label)))
    # randomly shuffle the data
    data, label = random_shuffle(data, label)
    # save tag2id
    save_dict(tag2id, tag2id_path)
    # label encoder
    total_label = label_encoder(label, len(tag2id))

    # get train test data
    train_data, dev_data, train_label, dev_label = train_test_split(
        data, total_label, test_size=0.2)
    logger.info("train data size: {}".format(len(train_data)))
    logger.info("dev data size: {}".format(len(dev_data)))
    # bert tokenizer
    tokenizer = get_tokenizer()
    # tokenizer = get_roberta_tokenizer()
    # prepare the model inputs
    train_x, train_y = create_inputs_targets(train_data, train_label, max_len,
                                             tokenizer)
    dev_x, dev_y = create_inputs_targets(dev_data, dev_label, max_len,
                                         tokenizer)

    # create model bert
    # model = create_model(len(tag2id))
    model = create_model(args["bert_model_name"], len(tag2id))
    # model.summary()
    model.fit(train_x,
              train_y,
              epochs=epoch,
              verbose=1,
              batch_size=batch_size,
              validation_data=(dev_x, dev_y),
              validation_batch_size=batch_size)  # , validation_split=0.1

    # model save
    model_path = os.path.join(args["output_path"], "classification_model.h5")
    model.save_weights(model_path, overwrite=True)

    # save pb model
    tf.keras.models.save_model(model,
                               args["pb_path"],
                               save_format="tf",
                               overwrite=True)
Example #12
def main():
    args = get_args()
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    df_train = pd.read_csv(args["train_file"])
    train_datas = df_train['comment_text'].tolist()
    train_labels = df_train[label_cols].values.tolist()
    print("train data size: ", len(train_datas))
    print("train label size: ", len(train_labels))

    train_data, val_data, train_label, val_label = train_test_split(train_datas,
                                                                    train_labels,
                                                                    test_size=0.2,
                                                                    random_state=0)

    tokenizer = get_tokenizer(args["bert_model_name"],
                              args["pretrain_model_path"])

    train_x, train_y = get_model_data(train_data, train_label, tokenizer,
                                      args["max_length"])

    val_x, val_y = get_model_data(val_data, val_label, tokenizer, args["max_length"])
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    model = create_model(args["bert_model_name"], len(label_cols))

    # custom F1-score metric callback
    # metrics = Metrics(val_x, val_y)
    # callbacks = [metrics]

    # save the best model (exported as a pb / SavedModel)
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            # Path where to save the model.
            # The two parameters below mean that we will overwrite
            # the current checkpoint if and only if the monitored
            # metric ('auc' here) has improved; add {epoch} to the
            # filepath to keep one checkpoint per epoch.
            filepath="./output/model/1",
            save_best_only=True,  # Only save a model if the monitored metric has improved.
            monitor='auc',   # 'accuracy',
            verbose=1,
        )
    ]

    model.fit(train_x, train_y, epochs=args["epoch"], verbose=1,
              batch_size=args["batch_size"],
              callbacks=callbacks,
              validation_data=(val_x, val_y),
              validation_batch_size=args["batch_size"])

    if not os.path.exists(args["model_path"]):
        os.makedirs(args["model_path"])

    model.save_weights(args["model_path"])

    if not os.path.exists(args["pbmodel_path"]):
        os.makedirs(args["pbmodel_path"])
    tf.keras.models.save_model(model, args["pbmodel_path"], save_format="tf")
Example #13
 def __init__(self, df: pd.DataFrame):
     self._data_df = df
     self._tokenizer = get_tokenizer('roberta')
     self._sentiment_ids = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
     n_data = self._data_df.shape[0]
     self._input_ids = np.ones((n_data, Config.Train.max_len), dtype='int32')
     self._attention_mask = np.zeros((n_data, Config.Train.max_len), dtype='int32')
     self._token_type_ids = np.zeros((n_data, Config.Train.max_len), dtype='int32')
     self._start_tokens = np.zeros((n_data, Config.Train.max_len), dtype='int32')
     self._end_tokens = np.zeros((n_data, Config.Train.max_len), dtype='int32')
Example #14
 def __init__(self, sentence, word_dict, lowercase, language='en'):
     self.sentence = sentence
     tokenize = utils.get_tokenizer(language)
     if lowercase:
         pre_tokenize = sentence.lower()
     else:
         pre_tokenize = sentence
     self.tokens = tokenize(pre_tokenize)
     self.indices = [word_dict[token] for token in self.tokens_with_null]
     self.padding_index = word_dict[utils.PADDING]
Example #15
 def __init__(self, ckpt_path, max_seq_length=128, batch_size=32):
     print('load gpt2 scorer from', ckpt_path)
     ckpt_dir = os.path.dirname(ckpt_path)
     self.tokenizer = get_tokenizer('gpt2', ckpt_dir)
     self.criterion = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=self.tokenizer.pad_token_id)
     self.criterion.cuda()
     self.model = torch.load(ckpt_path)
     self.model.eval()
     self.model.cuda()
     self.max_seq_length = max_seq_length
     self.batch_size = batch_size
Example #16
 def __init__(self, ckpt_path, max_seq_length=128, batch_size=32):
     print('load bert proposal from', ckpt_path)
     # config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
     ckpt_dir = os.path.dirname(ckpt_path)
     # config = config_class.from_pretrained(ckpt_dir)
     self.tokenizer = get_tokenizer('bert', ckpt_dir)
     self.model = torch.load(ckpt_path)
     self.model.eval()
     self.max_seq_length = max_seq_length
     self.batch_size = batch_size
     self.model.cuda()
Example #17
def create_keras_sequences(data_path='../text/cleaned/'):
    """Previous stuff before running model train."""
    logging.info('Loading and wrangling data.')
    lines = get_lines(data_path)
    tokenizer = get_tokenizer(lines)
    all_words_list = ordereddict_to_list(tokenizer.word_counts)
    p = p_distribution(lines)
    index = int(len(lines) * VAL_RATIO)
    return (TextSequence(lines[:-index], tokenizer, all_words_list, BATCH_SIZE,
                         8, 20, p),
            TextSequence(lines[-index:], tokenizer, all_words_list, BATCH_SIZE,
                         8, 20, p), len(tokenizer.word_index) + 1)
Example #18
File: test.py  Project: ashwan1/TSE-2020
def predict_test():
    print('\n>> Predicting on test')
    max_l = Config.Train.max_len
    test_df = pd.read_csv(Config.test_path)
    _test_generator = RobertaTestDataGenerator(test_df)
    test_dataset = tf.data.Dataset.from_generator(_test_generator.generate,
                                                  output_types=({
                                                      'ids': tf.int32,
                                                      'att': tf.int32,
                                                      'tti': tf.int32
                                                  }))
    test_dataset = test_dataset.padded_batch(Config.Train.batch_size,
                                             padded_shapes=({
                                                 'ids': [max_l],
                                                 'att': [max_l],
                                                 'tti': [max_l]
                                             }),
                                             padding_values=({
                                                 'ids': 1,
                                                 'att': 0,
                                                 'tti': 0
                                             }))
    test_dataset = test_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    model_dir = Config.Train.checkpoint_dir / Config.model_type
    start_preds = 0
    end_preds = 0
    model_count = len(list(model_dir.iterdir()))
    for i in range(model_count):
        model_path = model_dir / f'weights_{Config.version}_{i}.h5'
        model = get_roberta()
        model.load_weights(str(model_path))
        # accumulate each fold's probabilities instead of overwriting them
        fold_start, fold_end = model.predict(test_dataset, verbose=1)
        start_preds += fold_start
        end_preds += fold_end
    start_idx = start_preds / model_count
    end_idx = end_preds / model_count
    start_idx = np.argmax(start_idx, axis=-1)
    end_idx = np.argmax(end_idx, axis=-1)
    end_idx = np.where(start_idx > end_idx, start_idx, end_idx)
    tokenizer = get_tokenizer('roberta')
    selected_texts = []
    for i, row in enumerate(test_df.itertuples(index=False, name='tweet')):
        a = start_idx[i]
        b = end_idx[i]
        text = ' ' + ' '.join(row.text.split())
        encoded_text = tokenizer.encode(text)
        selected_text = tokenizer.decode(encoded_text.ids[a - 1:b])
        selected_texts.append(selected_text)
    test_df['selected_text'] = selected_texts
    test_df.to_csv('test_predictions.csv', index=False)
    test_df[['textID', 'selected_text']].to_csv('submission.csv', index=False)
Example #19
def main(test_data, args, label_num):
    # test_steps_per_epoch = len(test_data) // args["batch_size"]
    tokenizer = get_tokenizer(args['bert_model_name'],
                              args['pretrain_model_path'])
    testdata = get_model_data(test_data, tokenizer, args["max_length"])
    print("testdata: ", testdata)
    model = create_model(args['bert_model_name'], label_num)
    model.load_weights("./output/model/mulclassifition.h5")

    pred_logits = model.predict(testdata, batch_size=args["batch_size"])
    pred = np.where(pred_logits >= 0.5, 1, 0).tolist()
    # pred = np.where(pred < 0.5, pred, 1).tolist()
    return pred
Example #20
def text_classifier_predict(sentences, max_len, tag2id, bert_model_name,
                            model_path):
    # get tokenizer
    tokenizer = get_tokenizer()
    test_x = create_infer_inputs(sentences, max_len, tokenizer)
    # id2tag
    id2tag = {value: key for key, value in tag2id.items()}
    # model
    model = create_model(bert_model_name, len(tag2id))
    model.load_weights(model_path)
    logits = model.predict(test_x)
    pred = np.argmax(logits, axis=1).tolist()
    pred_label = [id2tag[i] for i in pred]
    print("preict label: ", pred_label)
Example #21
def main():
    args = get_args()
    df_test = pd.read_csv(args["test_file"])
    test_data = df_test['comment_text'].values.tolist()
    label_cols = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    tokenizer = get_tokenizer(args['bert_model_name'],
                              args['pretrain_model_path'])
    testdata = get_model_data(test_data, tokenizer, args["max_length"])

    model = create_model(args["bert_model_name"], len(label_cols))

    model.load_weights(args["model_path"])
    pred_logits = model.predict(testdata)
    pred = np.where(pred_logits > 0.15, 1, 0).tolist()
    print(pred)
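To map the 0/1 prediction matrix back to label names, a small sketch using the label_cols list defined above (the 0.15 threshold comes from the snippet; 'clean' is just a placeholder for rows with no active label):

for row in pred[:5]:
    active = [name for name, flag in zip(label_cols, row) if flag]
    print(active if active else ['clean'])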
Example #22
    def __init__(self, verbose=False, rnn_type="gru"):

        self.max_nr_utterances = config.data["max_nr_utterances"]
        self.max_nr_words = config.data["max_nr_words"]
        self.corpus = config.corpus["corpus"]
        self.detail_level = config.corpus["detail_level"]

        self.verbose = verbose

        self.id2tag = get_id2tag(self.corpus, detail_level=self.detail_level)
        self.tag2id = {t: id for id, t in self.id2tag.items()}
        self.tag2full = get_tag2full_label(self.corpus, self.detail_level)
        self.n_tags = len(self.tag2id.keys())

        self.tokenizer = get_tokenizer(rebuild_from_all_words=False)
        word2id = self.tokenizer.word_index

        # WARNING: if you force rebuild, the embedding matrix
        # may change and you may need to retrain the Neural Network!

        # set force rebuild to False when not changing total vocabulary
        self.embedding_matrix = get_embedding_matrix(word2id,
                                                     force_rebuild=False)

        # use GPU
        os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

        factory = BiRNN_CRF_factory(self.embedding_matrix, self.n_tags,
                                    rnn_type)
        self.model = factory.get()

        data_name = self.corpus + "_detail_" + str(self.detail_level)
        checkpoint_path = "../trained_model/bilstm_crf/ckpt_" + data_name + ".hdf5"
        if os.path.exists(checkpoint_path):
            if self.verbose:
                print("loading trained weights...")
            self.model.load_weights(checkpoint_path)
            if self.verbose:
                print("Done!")
        else:
            print("WARNING: no model found in path, using untrained model!")
Example #23
    def __init__(self,
                 vocab=None,
                 tokenizer=None,
                 maxlen=30,
                 model_dir=Path('data_in')):

        if vocab is None or tokenizer is None:
            tok_path = get_tokenizer()
            self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
            self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
            _, vocab_of_gluonnlp = get_kobert_model()
            token2idx = vocab_of_gluonnlp.token_to_idx
            self.vocab = Vocabulary(token2idx=token2idx)
            self.tokenizer = Tokenizer(vocab=self.vocab,
                                       split_fn=self.ptr_tokenizer,
                                       pad_fn=keras_pad_fn,
                                       maxlen=maxlen)
        else:
            self.vocab = vocab
            self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.model_dir = model_dir
Example #24
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        assert False, "--dataset must be one of ['squadv1', 'squadv2', 'debug']"

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        print("Starting finetuning")
        pbar = tqdm.tqdm(total=total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model, data_dir=data_dir, filename=val_filename, batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}"
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step
                    )

        if is_final_step:
            break

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
        return results
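The LinearWarmupPolyDecaySchedule implementation is not shown above; as a rough sketch of the curve such a schedule usually follows (linear warmup to the peak rate, then decay to end_learning_rate; plain linear decay is an assumption here):

def lr_at(step, max_lr, warmup_steps, total_steps, end_lr=0.0):
    """Sketch: linear warmup to max_lr, then linear decay to end_lr."""
    if step < warmup_steps:
        return max_lr * step / max(1, warmup_steps)
    frac = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return max_lr + (end_lr - max_lr) * min(1.0, frac)

# e.g. lr_at(0, 5e-5, 100, 1000) == 0.0 and lr_at(100, 5e-5, 100, 1000) == 5e-5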
Example #25
    )
    g.set_title(title, fontsize=16)
    g.set_xlabel("Predicted Label", fontsize=14)
    g.set_ylabel("True Label", fontsize=14)
    plt.savefig(save_path, bbox_inches="tight")
    plt.show()


conversations, labels = load_corpus_data(corpus, detail_level)

conversations = chunk(conversations, max_nr_utterances)
labels = chunk(labels, max_nr_utterances)

n_tags = len(get_id2tag(corpus, detail_level=detail_level))

tokenizer = get_tokenizer(rebuild_from_all_words=False)
word2id = tokenizer.word_index

X, y = make_model_readable_data(conversations, labels, tokenizer,
                                max_nr_utterances, max_nr_words)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    shuffle=True)

# import pretrained embeddings
# set force rebuild to False when not changing total vocabulary
embedding_matrix = get_embedding_matrix(word2id, force_rebuild=False)

# model = get_bilstm_crf_model(embedding_matrix, n_tags)
Example #26
    vocab_path = '../kogpt2/kogpt2_news_wiki_ko_cased_818bfa919d.spiece'

    return get_kogpt2_model(model_path, vocab_path, ctx)


def load_kogpt2_model_from_checkpoint(kogpt2, load_path, device, ctx='cpu'):
    try:
        checkpoint = torch.load(load_path, map_location=device)
        
        kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])

        kogpt2model.eval()
    except Exception:
        count = 0
        kogpt2model, _ = load_kogpt2_model()
    else:
        count = int(re.findall(r"\d+", load_path)[1])
    
    print(count)
    return kogpt2model, count


if __name__ == "__main__":
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path,  num_best=0, alpha=0)

    PATH = '../model/pretrained_kogpt2.pth'
    torch.save(model.state_dict(), PATH)
Example #27
import requests
import traceback

import numpy as np
from flask import request
from flask import Flask, jsonify
from tensorflow.keras.preprocessing.sequence import pad_sequences

from config import get_args
from utils import get_tokenizer

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

args = get_args()
tokenizer = get_tokenizer(args['bert_model_name'], args['pretrain_model_path'])


def get_model_data(sentence, tokenizer, max_seq_len=128):
    dataset_dict = {
        "input_ids": [],
        "attention_mask": [],
    }

    input_ids = tokenizer.encode(
        sentence,  # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=max_seq_len,  # Truncate all sentences.
    )
    sentence_length = len(input_ids)
    input_ids = pad_sequences([input_ids],
Example #28
    config = json.load(f)

model_config = config["model"]
pretraining_config = config["pretraining_setting"]
gpu_config = config["gpu_setting"]
checkpoint_dir = config["model_checkpoints"]
glue_dataset_folder = config["glue_dataset_folder"]

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Datasets ###########################

tokenizer = utils.get_tokenizer(model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

data_args = GlueDataTrainingArguments(
    task_name=args.task,
    data_dir=os.path.join(glue_dataset_folder, args.task),
    max_seq_length=model_config["max_seq_len"],
    overwrite_cache=True)
train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
data_loader = DataLoader(train_dataset,
                         batch_size=args.batch_size,
                         shuffle=True,
                         collate_fn=default_data_collator)
num_steps_per_epoch = len(data_loader)
print(f"num_steps_per_epoch: {num_steps_per_epoch}", flush=True)
Example #29
File: data.py  Project: liyucheng09/lyc
    @classmethod
    def get_true_length(cls, examples):
        assert cls.tokenizer is not None
        print(f'Tokenizer_type: {cls.tokenizer.name_or_path}, should check the n_real method.')
        examples['n'] = [sum(i) - 2 for i in examples['attention_mask']]
        examples['n_real'] = [sum([0 if cls.tokenizer.convert_ids_to_tokens(i).startswith('##') 
                            else 1 for i in line]) - 2 for line in examples['input_ids']]
        return examples


if __name__ == '__main__':
    from utils import get_tokenizer
    from copy import deepcopy

    t=get_tokenizer('bert-base-chinese', is_zh=True)
    ds = get_tokenized_ds('hfds_scripts/atec_dataset.py', '../sentence-embedding/data/ATEC/atec_nlp_sim_train.csv', t, tokenize_type='with_prefix')

    ds = ds['atec']
    ds2=deepcopy(ds)

    for index, ds_ in enumerate([ds, ds2]):
        features=list(ds_.features)
        for feature in features:
            if index:
                if feature.startswith('textb') or feature == 'label':
                    ds_.remove_columns_(feature)
                else:
                    ds_.rename_column_(feature, feature[6:])
            else:
                if feature.startswith('texta') or feature == 'label':
Example #30
data_folder = os.path.join(curr_path, "datasets", config["data_folder"])

if args.batch_size is not None:
    pretraining_config["batch_size"] = args.batch_size

if args.num_batch is not None:
    pretraining_config["validate_batches_per_epoch"] = args.num_batch

device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Dataset ###########################

tokenizer = utils.get_tokenizer(os.path.join(curr_path, 'roberta-base'),
                                model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

if "dataset" not in config:
    config["dataset"] = None

dataset = CorpusDataset(folder_path=data_folder,
                        file_json="dev.json",
                        option=config["dataset"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)
data_loader = DataLoader(dataset,
                         batch_size=pretraining_config["batch_size"],
                         collate_fn=data_collator)
pretrain_dataloader_iter = enumerate(data_loader)
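A minimal sketch of consuming the iterator built above; with DataCollatorForLanguageModeling(mlm=True), each batch is a dict whose masked positions carry the original ids in labels and -100 everywhere else (the three-batch limit below is arbitrary):

for step, batch in pretrain_dataloader_iter:
    # batch["input_ids"] has ~15% of tokens masked or replaced;
    # batch["labels"] holds the original ids at masked positions and -100 elsewhere.
    print(step, batch["input_ids"].shape, batch["labels"].shape)
    if step >= 2:
        break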