Example #1
def predict(s):
    pre_toks = tokenizer.tokenize(s)
    max_seq_len = 128

    # Write the tokenized sentence as a single-line, tab-separated record in the
    # format get_dataloader expects: the tokens fill both the source and target
    # columns, and the remaining columns get placeholder values.
    with open('tmp', 'w') as f:
        f.write('\t'.join(
            ['na', ' '.join(pre_toks), ' '.join(pre_toks), 'na', 'na']))

    dl, _ = get_dataloader('tmp', tok2id, 1)
    for batch in dl:
        (pre_id, pre_mask, pre_len, post_in_id, post_out_id, pre_tok_label_id,
         _, rel_ids, pos_ids, categories) = batch

        # '行' is used as the decoder start token; its counterpart '止' below
        # marks the end of the generated sequence.
        post_start_id = tok2id['行']
        # Cap generation at the input length plus a small margin.
        max_len = min(max_seq_len, pre_len[0].detach().cpu().numpy() + 10)

        with torch.no_grad():
            predicted_toks, predicted_probs = joint_model.inference_forward(
                pre_id,
                post_start_id,
                pre_mask,
                pre_len,
                max_len,
                pre_tok_label_id,
                rel_ids=rel_ids,
                pos_ids=pos_ids,
                categories=categories,
                beam_width=1)

        # Map the predicted ids back to tokens, skipping the start token.
        pred_seq = [id2tok[x] for x in predicted_toks[0][1:]]
        # Truncate at the '止' stop token if the model emitted one.
        if '止' in pred_seq:
            pred_seq = pred_seq[:pred_seq.index('止')]
        # Drop padding and merge WordPiece subwords back into words.
        pred_seq = ' '.join(pred_seq).replace('[PAD]', '').strip()
        pred_seq = pred_seq.replace(' ##', '')
        return pred_seq
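
# Usage sketch (not part of the original snippet): this assumes the globals that
# predict() relies on -- tokenizer, tok2id, id2tok, joint_model and
# get_dataloader -- have already been set up as in the later examples, and the
# input sentence is purely illustrative.
print(predict('the mission was a stunning success'))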
Example #2
import model as joint_model
import utils as joint_utils

assert ARGS.inference_output, "Need to specify inference_output arg!"

# # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # #
tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model,
                                          cache_dir=ARGS.working_dir +
                                          '/cache')
tok2id = tokenizer.vocab
tok2id['<del>'] = len(tok2id)

eval_dataloader, num_eval_examples = get_dataloader(
    ARGS.test,
    tok2id,
    ARGS.test_batch_size,
    ARGS.working_dir + '/test_data.pkl',
    test=True,
    add_del_tok=ARGS.add_del_tok)

# # # # # # # # ## # # # ## # # MODEL # # # # # # # # ## # # # ## # #

if ARGS.pointer_generator:
    debias_model = seq2seq_model.PointerSeq2Seq(
        vocab_size=len(tok2id),
        hidden_size=ARGS.hidden_size,
        emb_dim=768,
        dropout=0.2,
        tok2id=tok2id)  # 768 = bert hidden size
else:
    # Assumed to take the same constructor arguments as PointerSeq2Seq above.
    debias_model = seq2seq_model.Seq2Seq(
        vocab_size=len(tok2id),
        hidden_size=ARGS.hidden_size,
        emb_dim=768,
        dropout=0.2,
        tok2id=tok2id)
Example #3
import os
import sys

sys.path.append('.')
from shared.data import get_dataloader
from shared.args import ARGS

if not os.path.exists(ARGS.working_dir):
    os.makedirs(ARGS.working_dir)

print('LOADING DATA...')
tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model,
                                          cache_dir=ARGS.working_dir +
                                          '/cache')
tok2id = tokenizer.vocab
tok2id['<del>'] = len(tok2id)

train_dataloader, num_train_examples = get_dataloader(
    ARGS.train, tok2id, ARGS.train_batch_size, ARGS.max_seq_len,
    ARGS.working_dir + '/train_data.pkl')
eval_dataloader, num_eval_examples = get_dataloader(ARGS.test,
                                                    tok2id,
                                                    ARGS.test_batch_size,
                                                    ARGS.max_seq_len,
                                                    ARGS.working_dir +
                                                    '/test_data.pkl',
                                                    test=True)

featurizer = Featurizer(tok2id)


def data_for_scipy(dataloader, by_seq=False):
    outX = []
    outY = []
Example #4
writer = SummaryWriter(ARGS.working_dir)

# # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # #
tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model,
                                          cache_dir=ARGS.working_dir +
                                          '/cache')
tok2id = tokenizer.vocab

tok2id['<del>'] = len(tok2id)
print("Vocab size: {}".format(len(tok2id)))

if ARGS.pretrain_data:
    print("Loading pretrain data...")
    pretrain_dataloader, num_pretrain_examples = get_dataloader(
        ARGS.pretrain_data,
        tok2id,
        ARGS.train_batch_size,
        ARGS.working_dir + '/pretrain_data.pkl',
        noise=True)

print("Loading train data...")
train_dataloader, num_train_examples = get_dataloader(
    ARGS.train,
    tok2id,
    ARGS.train_batch_size,
    ARGS.working_dir + '/train_data.pkl',
    categories_path=ARGS.categories_file,
    add_del_tok=ARGS.add_del_tok)
print("Loading eval data...")
eval_dataloader, num_eval_examples = get_dataloader(
    ARGS.test,
    tok2id,
Example #5
import sys

with open(ARGS.working_dir + '/command.sh', 'w') as f:
    f.write('python ' + ' '.join(sys.argv) + '\n')

# # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # #

print('LOADING DATA...')
tokenizer = BertTokenizer.from_pretrained(ARGS.bert_model,
                                          cache_dir=ARGS.working_dir +
                                          '/cache')
tok2id = tokenizer.vocab
tok2id['<del>'] = len(tok2id)

print(ARGS)
train_dataloader, num_train_examples = get_dataloader(
    ARGS.train,
    tok2id,
    ARGS.train_batch_size,
    ARGS.working_dir + '/train_data.pkl',
    categories_path=ARGS.categories_file)
eval_dataloader, num_eval_examples = get_dataloader(
    ARGS.test,
    tok2id,
    ARGS.test_batch_size,
    ARGS.working_dir + '/test_data.pkl',
    test=True,
    categories_path=ARGS.categories_file)

# # # # # # # # ## # # # ## # # MODEL # # # # # # # # ## # # # ## # #

print('BUILDING MODEL...')
if ARGS.extra_features_top:
    model = tagging_model.BertForMultitaskWithFeaturesOnTop.from_pretrained(
Example #6
    TEST_BATCH_SIZE = 16
else:
    TRAIN_BATCH_SIZE = ARGS.train_batch_size
    # Shrink the eval batch to offset the beam_width-fold expansion of each
    # example during beam-search decoding.
    TEST_BATCH_SIZE = ARGS.test_batch_size // ARGS.beam_width

# # # # # # # # ## # # # ## # # DATA # # # # # # # # ## # # # ## # #
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                          cache_dir=ARGS.working_dir +
                                          '/cache')
tok2id = tokenizer.vocab
tok2id['<del>'] = len(tok2id)

if ARGS.pretrain_data:
    pretrain_dataloader, num_pretrain_examples = get_dataloader(
        ARGS.pretrain_data,
        tok2id,
        TRAIN_BATCH_SIZE,
        ARGS.working_dir + '/pretrain_data.pkl',
        noise=True)

train_dataloader, num_train_examples = get_dataloader(
    ARGS.train,
    tok2id,
    TRAIN_BATCH_SIZE,
    ARGS.working_dir + '/train_data.pkl',
    add_del_tok=ARGS.add_del_tok)
eval_dataloader, num_eval_examples = get_dataloader(
    ARGS.test,
    tok2id,
    TEST_BATCH_SIZE,
    ARGS.working_dir + '/test_data.pkl',
    test=True,
    add_del_tok=ARGS.add_del_tok)  # assumed: mirrors the train-data call above
Example #7
from collections import defaultdict

from tqdm import tqdm

BERT_MODEL = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                          cache_dir=ARGS.working_dir +
                                          '/cache')
tok2id = tokenizer.vocab
tok2id['<del>'] = len(tok2id)

featurizer = Featurizer(tok2id)
feature_names = featurizer.get_feature_names()

TARGET_VARIABLE = 'entailing'
tgt_idx = feature_names.index(TARGET_VARIABLE)

dataloader, num_pretrain_examples = get_dataloader(
    ARGS.train, tok2id, ARGS.train_batch_size,
    ARGS.working_dir + '/pretrain_data.pkl')

feature_counts = defaultdict(
    lambda: {
        'n00': 1.,  # docs without term, 0 label
        'n01': 1.,  # docs without term, 1 label
        'n10': 1.,  # docs with term, 0 label
        'n11': 1.  # docs with term, 1 label
    })
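
# Illustrative helper (not part of the original snippet): once the loop below has
# populated these add-one-smoothed 2x2 contingency counts, a per-feature
# association score such as a log-odds ratio could be computed along these lines.
import math

def log_odds_ratio(counts):
    # counts is one of the per-feature dicts above, with keys n00, n01, n10, n11.
    return math.log((counts['n11'] * counts['n00']) /
                    (counts['n10'] * counts['n01']))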

for batch in tqdm(dataloader):
    (pre_id, pre_mask, pre_len, post_in_id, post_out_id, pre_tok_label_id,
     post_tok_label_id, rel_ids, pos_ids, categories) = batch

    features = featurizer.featurize_batch(pre_id.detach().cpu().numpy(),