예제 #1
0
    # load the bert setting
    # The config class is picked from the config file path: BertConfig unless
    # the path contains 'albert'; for ALBERT paths, 'google' in the path
    # selects AlbertConfig, anything else ALBertConfig.
    if 'albert' not in args.bert_config_file:
        config_cls = BertConfig
    elif 'google' in args.bert_config_file:
        config_cls = AlbertConfig
    else:
        config_cls = ALBertConfig
    bert_config = config_cls.from_json_file(args.bert_config_file)

    # load data
    print('loading data...')
    tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
    # The vocab size passed in args must match the loaded vocab file.
    assert args.vocab_size == len(tokenizer.vocab)

    # Generate the cached example/feature files when either one is missing.
    if not os.path.exists(args.test_dir1) or not os.path.exists(args.test_dir2):
        json2features(args.test_file, [args.test_dir1, args.test_dir2], tokenizer, is_training=False,
                      max_seq_length=args.max_seq_length)

    # NOTE(review): this branch can only run if the call above did not leave
    # args.test_dir1 on disk, and it uses different settings (repeat_limit,
    # max_query_length, doc_stride) -- confirm whether it is still intended.
    if not os.path.exists(args.test_dir1):
        json2features(input_file=args.test_file, output_files=[args.test_dir1, args.test_dir2],
                      tokenizer=tokenizer, is_training=False, repeat_limit=3, max_query_length=96,
                      max_seq_length=args.max_seq_length, doc_stride=128)

    # Read through context managers so the file handles are closed right away
    # instead of being left for the garbage collector.
    with open(args.test_dir1, 'r') as examples_file:
        test_examples = json.load(examples_file)
    with open(args.test_dir2, 'r') as features_file:
        test_features = json.load(features_file)

    # Batches needed to cover every test feature: full batches plus one
    # extra batch when a partial remainder is left over.
    full_batches, leftover = divmod(len(test_features), args.n_batch)
    dev_steps_per_epoch = full_batches + 1 if leftover != 0 else full_batches

    # init model
    print('init model...')
    if 'albert' not in args.init_restore_dir:
예제 #2
0
        bert_config = BertConfig.from_json_file(args.bert_config_file)
    else:
        if 'google' in args.bert_config_file:
            bert_config = AlbertConfig.from_json_file(args.bert_config_file)
        else:
            bert_config = ALBertConfig.from_json_file(args.bert_config_file)

    # load data
    print('loading data...')
    tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    # The vocab size passed in args must match the loaded vocab file.
    assert args.vocab_size == len(tokenizer.vocab)

    # Build the training features when the cached file is missing; the
    # examples path is derived from the features path by name substitution.
    if not os.path.exists(args.train_dir):
        json2features(args.train_file, [
            args.train_dir.replace('_features_', '_examples_'), args.train_dir
        ],
                      tokenizer,
                      is_training=True,
                      max_seq_length=args.max_seq_length)

    # Build the dev examples/features when either cached file is missing.
    if not os.path.exists(args.dev_dir1) or not os.path.exists(args.dev_dir2):
        json2features(args.dev_file, [args.dev_dir1, args.dev_dir2],
                      tokenizer,
                      is_training=False,
                      max_seq_length=args.max_seq_length)

    # Read through context managers so the file handles are closed right away
    # instead of being left for the garbage collector.
    with open(args.train_dir, 'r') as train_file:
        train_features = json.load(train_file)
    with open(args.dev_dir1, 'r') as dev_examples_file:
        dev_examples = json.load(dev_examples_file)
    with open(args.dev_dir2, 'r') as dev_features_file:
        dev_features = json.load(dev_features_file)

    # Remove any existing log file at this path.
    if os.path.exists(args.log_file):
        os.remove(args.log_file)
예제 #3
0
    # load the bert setting
    # ALBertConfig when the config file path contains 'albert', otherwise
    # BertConfig; both are loaded from the same JSON file.
    config_cls = ALBertConfig if 'albert' in args.bert_config_file else BertConfig
    bert_config = config_cls.from_json_file(args.bert_config_file)

    # load data
    print('loading data...')
    tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    # The vocab size passed in args must match the loaded vocab file.
    assert args.vocab_size == len(tokenizer.vocab)

    # Build the training features when the cached file is missing; the
    # examples path is derived from the features path by name substitution.
    # The sequence length is taken from the model config's
    # max_position_embeddings.
    if not os.path.exists(args.train_dir):
        json2features(args.train_file, [
            args.train_dir.replace('_features_', '_examples_'), args.train_dir
        ],
                      tokenizer,
                      is_training=True,
                      max_seq_length=bert_config.max_position_embeddings)

    # Build the dev examples/features when either cached file is missing.
    if not os.path.exists(args.dev_dir1) or not os.path.exists(args.dev_dir2):
        json2features(args.dev_file, [args.dev_dir1, args.dev_dir2],
                      tokenizer,
                      is_training=False,
                      max_seq_length=bert_config.max_position_embeddings)

    # Read through context managers so the file handles are closed right away
    # instead of being left for the garbage collector.
    with open(args.train_dir, 'r') as train_file:
        train_features = json.load(train_file)
    with open(args.dev_dir1, 'r') as dev_examples_file:
        dev_examples = json.load(dev_examples_file)
    with open(args.dev_dir2, 'r') as dev_features_file:
        dev_features = json.load(dev_features_file)

    # Remove any existing log file at this path.
    if os.path.exists(args.log_file):
        os.remove(args.log_file)
예제 #4
0
    # load the bert setting
    # ALBertConfig when the config file path contains 'albert', otherwise
    # BertConfig; both are loaded from the same JSON file.
    config_cls = ALBertConfig if 'albert' in args.bert_config_file else BertConfig
    bert_config = config_cls.from_json_file(args.bert_config_file)

    # load data
    print('loading data...')
    tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=True)
    # The vocab size passed in args must match the loaded vocab file.
    assert args.vocab_size == len(tokenizer.vocab)

    # Build the test examples/features when either cached file is missing;
    # the sequence length is taken from the model config's
    # max_position_embeddings.
    if not os.path.exists(args.test_dir1) or not os.path.exists(
            args.test_dir2):
        json2features(args.test_file, [args.test_dir1, args.test_dir2],
                      tokenizer,
                      is_training=False,
                      max_seq_length=bert_config.max_position_embeddings)

    # Read through context managers so the file handles are closed right away
    # instead of being left for the garbage collector.
    with open(args.test_dir1, 'r') as examples_file:
        test_examples = json.load(examples_file)
    with open(args.test_dir2, 'r') as features_file:
        test_features = json.load(features_file)

    # Batches needed to cover every test feature: full batches plus one
    # extra batch when a partial remainder is left over.
    full_batches, leftover = divmod(len(test_features), args.n_batch)
    dev_steps_per_epoch = full_batches + 1 if leftover != 0 else full_batches

    # init model
    print('init model...')
    # NOTE(review): the model class is keyed on args.init_restore_dir while
    # the config class is keyed on args.bert_config_file -- confirm the two
    # paths always agree on 'albert'.
    if 'albert' in args.init_restore_dir:
        model = ALBertForQA(bert_config, dropout_rate=args.dropout)
    else:
        model = BertForQuestionAnswering(bert_config)