Example #1
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, my_pretrain_bert):
    # bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    # vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    # init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    # bert_config = BertConfig.from_json_file(bert_config_file)
    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=vocab_file, do_lower_case=do_lower_case)
    # bert_config.print_status()

    # model_bert = BertModel(bert_config)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=do_lower_case)
    model_bert = BertModel.from_pretrained('bert-base-uncased')
    bert_config = model_bert.config

    if my_pretrain_bert:
        init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')
        model_bert.load_state_dict(
            torch.load(init_checkpoint, map_location='cpu'))
        print("Loaded pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
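
A minimal usage sketch for get_bert above; the directory path and the bert_type value are assumptions, and the call assumes the module-level `device` used inside the function is already defined:

# Hypothetical call: stock Hugging Face weights only, no custom checkpoint.
model_bert, tokenizer, bert_config = get_bert(
    BERT_PT_PATH='./pretrained_bert',            # assumed layout
    bert_type='uncased_L-12_H-768_A-12',         # hypothetical bert_type value
    do_lower_case=True,
    my_pretrain_bert=False)
tokens = tokenizer.tokenize("Hello, World!")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
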
def get_data_iterators_yelp(train_lm=False, map_cpu=False):
    text_field = tt.data.Field(lower=args.lower)
    label_field = tt.data.LabelField(sequential=False, unk_token=None)
    length_field = tt.data.Field(sequential=False, use_vocab=False)
    offset_field = tt.data.Field(sequential=False, use_vocab=False)

    path_format = './.data/yelp_review_polarity_csv/%s.csv.token'
    bert_tokenizer = None
    if args.use_bert_tokenizer:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir='bert/cache')
    train_examples, test_examples = (get_examples_yelp(path_format % ds, train_lm, bert_tokenizer=bert_tokenizer)
                                     for ds in ['train', 'test'])
    dev_examples = test_examples[:500]
    train, dev, test = (tt.data.Dataset(ex, [('text', text_field), ('length', length_field), ('offset', offset_field), ('label', label_field)])
                        for ex in [train_examples, dev_examples, test_examples])

    vocab_path = 'vocab/vocab_yelp.pkl' if not args.use_bert_tokenizer else 'vocab/vocab_yelp_bert.pkl'
    if args.fix_test_vocab and not args.use_bert_tokenizer:
        vocab_path = 'vocab/vocab_yelp_fix.pkl'

    c_postfix = '.yelp'
    if args.use_bert_tokenizer:
        c_postfix += '.bert'
    if args.fix_test_vocab:
        c_postfix += '.fix'
    handle_vocab(vocab_path, text_field, (train, test), args.vector_cache + c_postfix, train_lm, max_size=20000)
    label_field.build_vocab(train)
    train_iter, dev_iter, test_iter = (
        tt.data.BucketIterator(x, batch_size=args.batch_size, device=args.gpu if not map_cpu else 'cpu', shuffle=False)
        for x in (train, dev, test))
    return text_field, label_field, train_iter, dev_iter, test_iter, train, dev
Example #3
    def __init__(self,
                 sample_list,
                 max_query_length,
                 max_seq_length,
                 train_flag=False,
                 device=None):
        super(BertRCDataset, self).__init__(sample_list, device)
        self.max_query_length = max_query_length
        self.max_seq_length = max_seq_length
        self.tokenizer = BertTokenizer(
            '%s/vocab.txt' % ('./pretrained/chinese_wwm_ext_pytorch'))
        self.cvt = BertInputConverter(self.tokenizer)
        self.train_flag = train_flag
        self.add_bert_fields()

        if train_flag:
            self.sample_list = [
                d for d in self.sample_list if len(d['char_spans']) == 1
            ]
        for sample in self.sample_list:
            tmp = self.cvt.convert(sample['question'],
                                   sample['passage'],
                                   self.max_query_length,
                                   self.max_seq_length,
                                   to_tensor=False)
            (input_ids, input_mask,
             segment_ids) = tmp['input'], tmp['att_mask'], tmp['seg']
            sample.update({
                'input_ids': input_ids,
                'input_mask': input_mask,
                'segment_ids': segment_ids
            })
            if train_flag:
                ss, se = sample['char_spans'][0]
                sample['bert_span'] = tmp['pos_map'][ss], tmp['pos_map'][se]
def get_data_iterators_sst_flatten(train_lm=False, map_cpu=False):
    text_field = tt.data.Field(lower=args.lower)
    length_field = tt.data.Field(sequential=False, use_vocab=False)
    offset_field = tt.data.Field(sequential=False, use_vocab=False)
    _, _, _ = tt.datasets.SST.splits(text_field, length_field, fine_grained=False, train_subtrees=False,
                                                     filter_pred=lambda ex: ex.label != 'neutral')

    path_format = './.data/sst/trees/%s.txt'

    bert_tokenizer = None
    if args.use_bert_tokenizer:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir='bert/cache')

    train_ex, dev_ex, test_ex = (get_examples_sst(path_format % ds, train_lm, bert_tokenizer=bert_tokenizer)
                                 for ds in ['train', 'dev', 'test'])
    train, dev, test = (tt.data.Dataset(ex, [('text', text_field), ('length', length_field), ('offset', offset_field)])
                        for ex in [train_ex, dev_ex, test_ex])

    vocab_path = 'vocab/vocab_sst.pkl' if not args.use_bert_tokenizer else 'vocab/vocab_sst_bert.pkl'
    c_postfix = '.sst'
    if args.use_bert_tokenizer:
        c_postfix += '.bert'
    handle_vocab(vocab_path, text_field, (train, dev, test), args.vector_cache + c_postfix, train_lm)

    train_iter, dev_iter, test_iter = (
        tt.data.BucketIterator(x, batch_size=args.batch_size, device=args.gpu if not map_cpu else 'cpu', shuffle=False)
        for x in (train, dev, test))
    return text_field, length_field, train_iter, dev_iter, test_iter, train, dev
Example #5
def test_evaluate_on_file():
    BERT_SERIALIZATION_DIR = './pretrained/chinese_wwm_ext_pytorch'  
    tokenizer = BertTokenizer('%s/vocab.txt'%(BERT_SERIALIZATION_DIR))
    device = torch.device('cpu')
    num_fn = functools.partial(generate_bert_pointwise_input, max_seq_len=200, max_passage_len=100, tokenizer=tokenizer, device=device)

    fake_model1 = lambda x, y, z: [[0, 1] for _ in range(len(x))]
    fake_model2 = lambda x, y, z: [[1, 0] for _ in range(len(x))]
    fake_model3 = lambda x, y, z: [[random.choice([0, 1]), random.choice([0, 1])] for _ in range(len(x))]
    fake_model4 = lambda x, y, z: [[random.uniform(0, 1), random.choice([0, 1])] for _ in range(len(x))]

    fake_model1.eval = lambda: None
    fake_model2.eval = lambda: None
    fake_model3.eval = lambda: None
    fake_model4.eval = lambda: None

    test_path = './data/demo/devset/search.dev.2.json'
    results1 = evaluate_on_file(test_path, fake_model1, num_fn, [('accuracy', accuracy), ('precision', precision)])
    results2 = evaluate_on_file(test_path, fake_model2, num_fn, [('accuracy', accuracy), ('precision', precision)])
    results3 = evaluate_on_file(test_path, fake_model3, num_fn, [('accuracy', accuracy), ('precision', precision)])
    results4 = evaluate_on_file(test_path, fake_model4, num_fn, [('precision', precision), ('precision2', functools.partial(precision, k=2))])
    X, y = load_examples_from_scratch(test_path, concat=False, attach_label='most_related_para')
    assert results1['accuracy'] == sum(y) / len(y)
    assert results2['accuracy'] == (len(y) - sum(y)) / len(y)

    assert precision([[-1, 1], [0, 2], [0, 3], [-1, -1]], [0, 1, 0, 0], k=2) == 0.5
    assert precision([[-1, 1], [0, 2], [0, 3], [-1, -1]], [0, 0, 0, 1], k=2) == 0
    print(results3['accuracy'])
    print(results1['precision'])
    print(results2['precision'])
    print(results4['precision'])
    print(results4['precision2'])
Example #6
def model_factory(bert_path, device=None, tokenizer=None, **kwargs):
    if device is None:
        device = get_default_device()
    if tokenizer is None:
        tokenizer = BertTokenizer('%s/vocab.txt' % (bert_path))
    model = BertForSequenceClassification.from_pretrained(bert_path,
                                                          num_labels=2,
                                                          **kwargs).to(device)

    return model, tokenizer, device
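
A hedged usage sketch for model_factory; the pretrained directory mirrors the one used in the other examples here, and the sentence scored is only illustrative:

# Illustrative only: load the classifier and score one sentence.
model, tokenizer, device = model_factory('./pretrained/chinese_wwm_ext_pytorch')
model.eval()
tokens = ['[CLS]'] + tokenizer.tokenize('歐巴馬撿到了槍') + ['[SEP]']
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], device=device)
with torch.no_grad():
    logits = model(input_ids)  # shape (1, 2): logits for the two classes
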
Example #7
def data2network(data_struct, data_type, params):
    # input
    sent_words = data_struct['sentences']

    # words
    org_sent_words = sent_words['sent_words']
    sent_words = prep_sentences(sent_words, data_type, params)
    wordsIDs = _elem2idx(sent_words, params['mappings']['word_map'])

    all_sentences = []

    # nner: Using subwords:
    tokenizer = BertTokenizer.from_pretrained(params['bert_model'],
                                              do_lower_case=False)

    for xx, sid in enumerate(data_struct['input']):
        # input
        sentence_data = data_struct['input'][sid]

        # document id
        fid = sid.split(':')[0]

        # words to ids
        word_ids = wordsIDs[xx]
        words = org_sent_words[xx]

        # entity
        readable_e, idxs, ents, toks2, etypes2ids, entities, sw_sentence, sub_to_word, subwords, valid_starts, tagsIDs, terms = entity2network(
            sentence_data, words, params, tokenizer)

        # return
        sentence_vector = OrderedDict()
        sentence_vector['fid'] = fid
        sentence_vector['ents'] = ents
        sentence_vector['word_ids'] = word_ids
        sentence_vector['words'] = words
        sentence_vector['offsets'] = sentence_data['offsets']
        sentence_vector['e_ids'] = idxs
        sentence_vector['tags'] = tagsIDs
        sentence_vector['etypes2'] = etypes2ids
        sentence_vector['toks2'] = toks2
        sentence_vector['raw_words'] = sentence_data['words']

        sentence_vector['entities'] = entities
        sentence_vector['sw_sentence'] = sw_sentence
        sentence_vector['terms'] = terms
        sentence_vector['sub_to_word'] = sub_to_word
        sentence_vector['subwords'] = subwords
        sentence_vector['valid_starts'] = valid_starts

        all_sentences.append(sentence_vector)

    return all_sentences
Example #8
def torch_data_2_network(cdata2network, params, do_get_nn_data):
    """ Convert object-type data to torch.tensor type data, aim to use with Pytorch
    """
    etypes = [data['etypes2'] for data in cdata2network]

    # nner
    entitiess = [data['entities'] for data in cdata2network]
    sw_sentences = [data['sw_sentence'] for data in cdata2network]
    termss = [data['terms'] for data in cdata2network]
    valid_startss = [data['valid_starts'] for data in cdata2network]

    fids = [data['fid'] for data in cdata2network]
    wordss = [data['words'] for data in cdata2network]
    offsetss = [data['offsets'] for data in cdata2network]
    sub_to_words = [data['sub_to_word'] for data in cdata2network]
    subwords = [data['subwords'] for data in cdata2network]

    tokenizer = BertTokenizer.from_pretrained(params['bert_model'],
                                              do_lower_case=False)

    # User-defined data
    if not params["predict"]:
        id_tag_mapping = params["mappings"]["nn_mapping"]["id_tag_mapping"]

        mlb = MultiLabelBinarizer()
        mlb.fit([sorted(id_tag_mapping)[1:]])  # [1:] skip label O

        params["mappings"]["nn_mapping"]["mlb"] = mlb
        params["mappings"]["nn_mapping"]["num_labels"] = len(mlb.classes_)

        params["max_span_width"] = max(params["max_entity_width"],
                                       params["max_trigger_width"])

        params["mappings"]["nn_mapping"]["num_triggers"] = len(
            params["mappings"]["nn_mapping"]["trigger_labels"])
        params["mappings"]["nn_mapping"]["num_entities"] = params["mappings"]["nn_mapping"]["num_labels"] - \
                                                           params["mappings"]["nn_mapping"]["num_triggers"]

    if do_get_nn_data:
        nn_data = get_nn_data(fids, entitiess, termss, valid_startss,
                              sw_sentences, tokenizer, params)

        return {
            'nn_data': nn_data,
            'etypes': etypes,
            'fids': fids,
            'words': wordss,
            'offsets': offsetss,
            'sub_to_words': sub_to_words,
            'subwords': subwords,
            'entities': entitiess
        }
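
The MultiLabelBinarizer block above only registers the known tag ids (label O is skipped via `[1:]`); a tiny standalone illustration of what it produces, with made-up tag ids:

# Standalone illustration of the MultiLabelBinarizer step (made-up tag ids 1..3).
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([[1, 2, 3]])                 # analogous to mlb.fit([sorted(id_tag_mapping)[1:]])
print(mlb.classes_)                  # [1 2 3]
print(mlb.transform([[2], [1, 3]]))  # [[0 1 0]
                                     #  [1 0 1]]
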
Example #9
def make_dataset(path_list):
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                              do_lower_case=True)
    cvt = BertInputConverter(tokenizer)

    def process_file(path):
        l = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in tqdm(f.readlines()):
                l.extend(make_examples(json.loads(line.strip()), cvt))
        return l

    examples = []
    for path in path_list:
        examples.extend(process_file(path))
    dataset = Dataset(examples, FIELDS)
    return dataset
def main():

    ## data
    set_name = 'test'  # 'train', 'dev' or 'test'
    raw_data_file = 'PAIRS_FILE'  # contains query passage pairs, format: example_id \t query_text \t passage text (\t label) \n
    output_features_file = 'FEATURES_FILE'  # format: example_id,input_ids,input_mask,segment_ids,label

    ## prepare tokenizer
    bert_model_dir = 'BERT_MODEL_DIR'  # contains vocab.txt file.
    tokenizer = BertTokenizer.from_pretrained(bert_model_dir,
                                              do_lower_case=True)
    max_seq_length = 256

    # start tokenize
    tokenize_to_features(set_name, raw_data_file, output_features_file,
                         tokenizer, max_seq_length)
    logger.info('Convert to csv done!')
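
tokenize_to_features itself is not shown in this snippet; the sketch below is only a guess at the per-pair feature construction implied by the comments above (pack a query/passage pair as [CLS] q [SEP] p [SEP], then pad to max_seq_length), and every name in it is hypothetical:

def pair_to_features(query, passage, tokenizer, max_seq_length=256):
    """Hypothetical helper, not the actual tokenize_to_features."""
    q_tokens = tokenizer.tokenize(query)
    p_tokens = tokenizer.tokenize(passage)
    # keep the whole query, truncate the passage (over-long queries are not handled here)
    p_tokens = p_tokens[:max(0, max_seq_length - len(q_tokens) - 3)]
    tokens = ['[CLS]'] + q_tokens + ['[SEP]'] + p_tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(p_tokens) + 1)
    input_mask = [1] * len(input_ids)
    pad = [0] * (max_seq_length - len(input_ids))
    return input_ids + pad, input_mask + pad, segment_ids + pad
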
def get_data_iterators_tacred(train_lm=False, map_cpu=False):
    text_field = tt.data.Field(lower=False)
    label_field = tt.data.LabelField()
    length_field = tt.data.Field(sequential=False, use_vocab=False)
    offset_field = tt.data.Field(sequential=False, use_vocab=False)
    pos_field = tt.data.Field()
    ner_field = tt.data.Field()
    subj_offset_field = tt.data.Field()
    obj_offset_field = tt.data.Field()

    path_format = './.data/TACRED/data/json/%s.json'
    bert_tokenizer = None
    if args.use_bert_tokenizer:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir='bert/cache')
    train_examples, dev_examples, test_examples = (get_examples_tacred(path_format % ds, train_lm, bert_tokenizer=bert_tokenizer)
                                                   for ds in ['train', 'dev', 'test'])
    train, dev, test = (tt.data.Dataset(ex, [('text', text_field), ('length', length_field), ('offset', offset_field),
                                             ('label', label_field), ('subj_offset', subj_offset_field),
                                             ('obj_offset', obj_offset_field), ('ner', ner_field), ('pos', pos_field)])
                        for ex in [train_examples, dev_examples, test_examples])

    vocab_path = 'vocab/vocab_tacred.pkl' if not args.use_bert_tokenizer else 'vocab/vocab_tacred_bert.pkl'
    if args.fix_test_vocab and not args.use_bert_tokenizer:
        vocab_path = 'vocab/vocab_tacred_fix.pkl'

    c_postfix = '.tacred'
    if args.use_bert_tokenizer:
        c_postfix += '.bert'
    if args.fix_test_vocab:
        c_postfix += '.fix'
    handle_vocab(vocab_path, text_field, (train, dev, test), args.vector_cache + c_postfix, train_lm, max_size=100000)
    handle_vocab(vocab_path + '.relation', label_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.subj_offset', subj_offset_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.obj_offset', obj_offset_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.pos', pos_field, (train, dev, test), '', False, None)
    handle_vocab(vocab_path + '.ner', ner_field, (train, dev, test), '', False, None)

    train_iter, dev_iter, test_iter = (
        tt.data.BucketIterator(x, batch_size=args.batch_size, device=args.gpu if not map_cpu else 'cpu')
        for x in (train, dev, test))
    return text_field, label_field, subj_offset_field, obj_offset_field, pos_field, ner_field, train_iter, dev_iter, test_iter, train, dev
Example #12
def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * bert-base-uncased
                                       * bert-large-uncased
                                       * bert-base-cased
                                       * bert-large-cased
                                       * bert-base-multilingual-uncased
                                       * bert-base-multilingual-cased
                                       * bert-base-chinese
    Keyword args:
    cache_dir: an optional path to a specific directory to download and cache
               the pre-trained model weights.
               Default: None
    do_lower_case: Whether to lower case the input.
                   Only has an effect when do_wordpiece_only=False
                   Default: True
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
                       Default: True
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    never_split: List of tokens which will never be split during tokenization.
                 Only has an effect when do_wordpiece_only=False
                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer
Example #13
def test_2():
    datapath = './data/demo/devset/search.dev.json'
    stg = sample_strategy_factory('trivial_n',k=1)
    examples,labels = load_examples_from_scratch(datapath,stg)
    #examples,labels  = load_examples_from_scratch(datapath,attach_label='most_related_para')
    #for (q,p),label in zip(examples[0:20],labels[0:20]):
    #    print(q)
    #    print(p[0:50])
    #    print(label)
    #    print('##'*10)
    # 
    #print(len(examples))    
    #print(examples[0:10])
    #print(labels[0:10])

    examples = load_examples_from_scratch(datapath,None)
    #print(len(examples))
    examples = load_examples_from_scratch(datapath,stg,concat=True)

    BERT_SERIALIZATION_DIR = './pretrained/chinese_wwm_ext_pytorch'  
    tokenizer = BertTokenizer('%s/vocab.txt'%(BERT_SERIALIZATION_DIR))
    device = torch.device('cpu')
    
    num_fn = functools.partial(generate_bert_pointwise_input, max_seq_len=200, max_passage_len=100, tokenizer=tokenizer, device=device)

    fake_examples = [('你好嗎', '歐巴馬撿到了300快'), ('我不好啦', '歐巴馬撿到了槍 高雄發大財了'), ('哈哈哈', '猜猜我是誰')]
    X = generate_bert_pointwise_input(fake_examples, 20, 7, tokenizer, device)
    for a, b, c in X:
        print(a.shape)
        print('- - - '*18)
    #print(X)
    #print(examples[0:2])
    bt = BatchIter(examples,16,num_fn)
    for batch,y in bt:
        print(batch[0][0].shape)
        print(batch[1][1].shape)
        print(y.shape)
Example #14
    def __init__(self,
                 sample_list,
                 bert_path,
                 max_passage_len,
                 max_seq_length,
                 device=None):
        super(BertRankDataset, self).__init__(sample_list, device)
        self.add_bert_fields()
        self.tokenizer = BertTokenizer('%s/vocab.txt' % (bert_path))
        self.max_seq_length = max_seq_length
        self.max_passage_len = max_passage_len
        #_num_fn = numeralize_fucntion_factory(config.NUM_FN_NAME)
        self.numeralize_fn = functools.partial(generate_bert_pointwise_input,max_seq_len=self.max_seq_length,max_passage_len=self.max_passage_len,\
            tokenizer=self.tokenizer,device=self.device,wrap_tensor_flag=False)

        examples = [(sample['question'], sample['passage'])
                    for sample in self.sample_list]
        bert_input_t, seg_ids_t, input_mask_t = self.numeralize_fn(examples)
        for i, sample in enumerate(self.sample_list):
            sample.update({
                'input_ids': bert_input_t[i],
                'input_mask': input_mask_t[i],
                'segment_ids': seg_ids_t[i]
            })
Example #15
    def __init__(self, config, decoder_dict=None, eval_flag=True, device=None):
        self.config = config
        self.device = device if device is not None else get_default_device()
        bert_config_path = '%s/bert_config.json' % (
            config.BERT_SERIALIZATION_DIR)
        self.model = load_bert_rc_model(bert_config_path, config.MODEL_PATH,
                                        self.device)
        self.model.load_state_dict(
            torch.load(config.MODEL_PATH, map_location=self.device))
        self.model = self.model.to(self.device)
        if eval_flag:
            self.model.eval()
        # bert-base-chinese
        self.tokenizer = BertTokenizer('%s/vocab.txt' %
                                       (config.BERT_SERIALIZATION_DIR),
                                       do_lower_case=True)
        if decoder_dict is None:
            self.decoder = MrcDecoderFactory.from_dict({
                'class': 'default',
                'kwargs': {}
            })
        else:
            self.decoder = MrcDecoderFactory.from_dict(decoder_dict)
Example #16
#        for y in type2son[x]:
#            t[type2id[y]] = alpha
#        prior_numpy[:, type2id[x]] = t
#    return prior_numpy
#
#prior = torch.from_numpy(create_prior())
#tune = torch.from_numpy(np.transpose(create_prior(args.hierarchy_alpha)))

logger.info('load bert and ernie tokenizer')
ernie_tokenizer_label = ErnieTokenizer_label.from_pretrained(
    'ernie_base/', do_lower_case=args.bert_low_case)

ernie_tokenizer = ErnieTokenizer.from_pretrained(
    'ernie_base/', do_lower_case=args.bert_low_case)

bert_tokenizer = BertTokenizer.from_pretrained(
    'bert_large/', do_lower_case=args.bert_low_case)


# dataset for open type
# left context token + mention_span + right_context_token
class OpenDataset(data.Dataset):
    def __init__(self, path):
        entries = open(path, 'r').read().strip().splitlines()
        self.left_context, self.right_context, self.mention_span, self.labels = [], [], [], []

        def trans(x):
            return x[x.rfind('/') + 1:]

        for entry in entries:
            entry = dict(eval(entry))
            ys = entry['y_str']
Example #17
def main(args):
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_eval` or `do_predict` must be True."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
        logger.warning('Output directory {} already exists.'.format(
            args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    tokenizer = BertTokenizer(args.vocab, do_lower_case=args.do_lower_case)
    dprd_task = DPRDTask(tokenizer)

    eval_data = dprd_task.get_dev_dataset(args.data_dir,
                                          128,
                                          input_type=args.tasks)
    if args.wnli_data:
        wnli_task = WNLITask(tokenizer)
        wnli_data = wnli_task.get_dev_dataset(args.wnli_data,
                                              128,
                                              input_type=args.tasks)
        eval_data += wnli_data
    if args.wsc273_data:
        wsc273_task = WSC273Task(tokenizer)
        wsc273_data = wsc273_task.get_dev_dataset(args.wsc273_data,
                                                  128,
                                                  input_type=args.tasks)
        eval_data += wsc273_data

    if args.gap_data:
        gap_task = GAPTask(tokenizer)
        gap_data = gap_task.get_dev_dataset(args.gap_data,
                                            384,
                                            input_type=args.tasks)
        eval_data += gap_data

    logger.info("  Evaluation batch size = %d", args.eval_batch_size)
    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_data = build_training_data_mt(args, tokenizer)

        total_examples = len(train_data)
        num_train_steps = int(total_examples / args.train_batch_size /
                              args.gradient_accumulation_steps *
                              args.num_train_epochs)
        logger.info("  Training batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

    model = create_model(args, 2, device)

    if args.do_train:
        train_model(args, device, n_gpu, model, train_data, eval_data,
                    num_train_steps)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        run_eval(args, model.eval(), device, eval_data, prefix=args.tag)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        test_data = dprd_task.get_test_dataset(args.data_dir,
                                               128,
                                               input_type=args.tasks)
        if args.wnli_data:
            wnli_data = wnli_task.get_test_dataset(args.wnli_data,
                                                   128,
                                                   input_type=args.tasks)
            test_data += wnli_data
        if args.wsc273_data:
            wsc273_data = wsc273_task.get_test_dataset(args.wsc273_data,
                                                       128,
                                                       input_type=args.tasks)
            test_data += wsc273_data
        logger.info("  Prediction batch size = %d", args.predict_batch_size)
        run_predict(args, model, device, test_data, prefix=args.tag)
import json
import pickle
from model.utils import Vocab
from bert.tokenization import BertTokenizer

with open('experiment/config.json') as f:
    params = json.loads(f.read())

# loading BertTokenizer
ptr_tokenizer = BertTokenizer.from_pretrained('bert/vocab.korean.rawtext.list',
                                              do_lower_case=False)
idx_to_token = list(ptr_tokenizer.vocab.keys())

# generate vocab
token_vocab = Vocab(idx_to_token,
                    padding_token='[PAD]',
                    unknown_token='[UNK]',
                    bos_token=None,
                    eos_token=None,
                    reserved_tokens=['[CLS]', '[SEP]', '[MASK]'],
                    unknown_token_idx=1)

# save vocab
token_vocab_path = params['filepath'].get('token_vocab')
with open(token_vocab_path, 'wb') as f:
    pickle.dump(token_vocab, f)
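
For symmetry with the save above, a minimal sketch of reading the pickled vocab back:

# Minimal round-trip check: reload the vocab that was just pickled.
with open(token_vocab_path, 'rb') as f:
    reloaded_vocab = pickle.load(f)
print(type(reloaded_vocab))
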
Example #19
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="The output directory where the model checkpoints will be written."
    )

    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--val_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--test_output", default=None, type=str)
    parser.add_argument("--label_vocab", default=None, type=str, required=True)
    parser.add_argument("--punc_set", default='PU', type=str)
    parser.add_argument("--has_confidence", action='store_true')
    parser.add_argument("--only_save_bert", action='store_true')

    parser.add_argument("--arc_space", default=512, type=int)
    parser.add_argument("--type_space", default=128, type=int)

    parser.add_argument("--log_file", default=None, type=str)

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action='store_true',
                        help="Whether to run predict on the test set.")
    parser.add_argument("--do_greedy_predict",
                        action='store_true',
                        help="Whether to run predict on the test set.")
    parser.add_argument("--do_ensemble_predict",
                        action='store_true',
                        help="Whether to run predict on the test set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--test_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for test.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.log_file is None:
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
            datefmt='%m/%d/%Y %H:%M:%S',
            level=logging.INFO)
    else:
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
            filename=args.log_file,
            filemode='w',
            datefmt='%m/%d/%Y %H:%M:%S',
            level=logging.INFO)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict and not args.do_greedy_predict and not args.do_ensemble_predict:
        raise ValueError(
            "At least one of `do_train`, `do_predict`, `do_greedy_predict` or `do_ensemble_predict` must be True.")

    if args.do_train:
        assert args.output_dir is not None

    if args.do_train and os.path.exists(args.output_dir) and os.listdir(
            args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if args.do_train and not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    label_vocab, label_vocab2idx = load_label_vocab(args.label_vocab)

    punc_set = set(
        args.punc_set.split(',')) if args.punc_set is not None else None

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        assert args.train_file is not None
        train_examples = read_conll_examples(
            args.train_file,
            is_training=True,
            has_confidence=args.has_confidence)

        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    if args.do_train or args.do_predict or args.do_greedy_predict:
        # load the pretrained model
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
        model = BertForDependencyParsing.from_pretrained(
            args.bert_model,
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                   'distributed_{}'.format(args.local_rank)),
            arc_space=args.arc_space,
            type_space=args.type_space,
            num_labels=len(label_vocab))

        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        #
        parser = model.module if hasattr(model, 'module') else model
    elif args.do_ensemble_predict:
        bert_models = args.bert_model.split(',')
        assert len(bert_models) > 1
        tokenizer = BertTokenizer.from_pretrained(
            bert_models[0], do_lower_case=args.do_lower_case)
        models = []
        for bm in bert_models:
            model = BertForDependencyParsing.from_pretrained(
                bm,
                cache_dir=os.path.join(
                    str(PYTORCH_PRETRAINED_BERT_CACHE),
                    'distributed_{}'.format(args.local_rank)),
                arc_space=args.arc_space,
                type_space=args.type_space,
                num_labels=len(label_vocab))
            model.to(device)
            model.eval()
            models.append(model)
        parser = models[0].module if hasattr(models[0],
                                             'module') else models[0]

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())

        # hack to remove the pooler, which is not used
        # and thus produces None grads that break apex
        # !!! NOTE why?
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    # start training loop
    if args.do_train:
        global_step = 0
        train_features = convert_examples_to_features(
            train_examples,
            tokenizer,
            args.max_seq_length,
            label_vocab2idx,
            True,
            has_confidence=args.has_confidence)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in train_features],
                                   dtype=torch.long)
        all_heads = torch.tensor([f.heads for f in train_features],
                                 dtype=torch.long)
        all_labels = torch.tensor([f.labels for f in train_features],
                                  dtype=torch.long)

        if args.has_confidence:
            all_confidence = torch.tensor(
                [f.confidence for f in train_features], dtype=torch.float32)
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_lengths, all_heads,
                                       all_labels, all_confidence)
        else:
            train_data = TensorDataset(all_input_ids, all_input_mask,
                                       all_segment_ids, all_lengths, all_heads,
                                       all_labels)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        if args.do_eval:
            assert args.val_file is not None
            eval_examples = read_conll_examples(args.val_file,
                                                is_training=False,
                                                has_confidence=False)
            eval_features = convert_examples_to_features(eval_examples,
                                                         tokenizer,
                                                         args.max_seq_length,
                                                         label_vocab2idx,
                                                         False,
                                                         has_confidence=False)
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            all_example_ids = torch.tensor(
                [f.example_id for f in eval_features], dtype=torch.long)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.float32)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_lengths = torch.tensor([f.seq_len for f in eval_features],
                                       dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_lengths,
                                      all_example_ids)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

        best_uas = 0
        best_las = 0
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            logger.info("Training epoch: {}".format(epoch))
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            model.train()
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                if args.has_confidence:
                    input_ids, input_mask, segment_ids, lengths, heads, label_ids, confidence = batch
                else:
                    confidence = None
                    input_ids, input_mask, segment_ids, lengths, heads, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, heads,
                             label_ids, confidence)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % 100 == 0:
                    logger.info("Training loss: {}, global step: {}".format(
                        tr_loss / nb_tr_steps, global_step))

            # we eval every epoch
            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank() == 0):
                logger.info("***** Running evaluation *****")

                model.eval()

                eval_predict_words, eval_predict_postags, eval_predict_heads, eval_predict_labels = [],[],[],[]

                for input_ids, input_mask, segment_ids, lengths, example_ids in tqdm(
                        eval_dataloader, desc="Evaluating"):
                    example_ids = example_ids.numpy()

                    batch_words = [
                        eval_features[eid].example.sentence
                        for eid in example_ids
                    ]
                    batch_postags = [
                        eval_features[eid].example.postags
                        for eid in example_ids
                    ]
                    batch_word_index = [
                        eval_features[eid].word_index for eid in example_ids
                    ]  # token -> word
                    batch_token_starts = [
                        eval_features[eid].token_starts for eid in example_ids
                    ]  # word -> token start
                    batch_heads = [
                        eval_features[eid].example.heads for eid in example_ids
                    ]

                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)

                    with torch.no_grad():
                        # tmp_eval_loss = model(input_ids, segment_ids, input_mask, heads, label_ids)
                        energy = model(input_ids, segment_ids, input_mask)

                    heads_pred, labels_pred = parser.decode_MST(
                        energy.cpu().numpy(),
                        lengths.numpy(),
                        leading_symbolic=0,
                        labeled=True)

                    # convert the subword-level dependency parse back to a word-level parse using the word-index and token-start maps
                    pred_heads = []
                    pred_labels = []
                    for i in range(len(batch_word_index)):
                        word_index = batch_word_index[i]
                        token_starts = batch_token_starts[i]
                        hpd = []
                        lpd = []
                        for j in range(len(token_starts)):
                            if j == 0:  #[CLS]
                                continue
                            elif j == len(token_starts) - 1:  # [SEP]
                                continue
                            else:
                                hpd.append(
                                    word_index[heads_pred[i, token_starts[j]]])
                                lpd.append(
                                    label_vocab[labels_pred[i,
                                                            token_starts[j]]])
                        pred_heads.append(hpd)
                        pred_labels.append(lpd)

                    eval_predict_words += batch_words
                    eval_predict_postags += batch_postags
                    eval_predict_heads += pred_heads
                    eval_predict_labels += pred_labels

                eval_output_file = os.path.join(args.output_dir, 'eval.pred')

                write_conll_examples(eval_predict_words, eval_predict_postags,
                                     eval_predict_heads, eval_predict_labels,
                                     eval_output_file)

                eval_f = os.popen(
                    "python scripts/eval_nlpcc_dp.py " + args.val_file + " " +
                    eval_output_file, "r")
                result_text = eval_f.read().strip()
                logger.info("***** Eval results *****")
                logger.info(result_text)
                eval_f.close()
                eval_res = re.findall(
                    r'UAS = \d+/\d+ = ([\d\.]+), LAS = \d+/\d+ = ([\d\.]+)',
                    result_text)
                assert len(eval_res) > 0
                eval_res = eval_res[0]

                eval_uas = float(eval_res[0])
                eval_las = float(eval_res[1])

                # save model
                if best_las < eval_las or (eval_las == best_las
                                           and best_uas < eval_uas):
                    best_uas = eval_uas
                    best_las = eval_las

                    logger.info(
                        "new best uas  %.2f%% las %.2f%%, saving models.",
                        best_uas, best_las)

                    # Save a trained model, configuration and tokenizer
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self

                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)

                    model_dict = model_to_save.state_dict()
                    if args.only_save_bert:
                        model_dict = {
                            k: v
                            for k, v in model_dict.items() if 'bert.' in k
                        }

                    torch.save(model_dict, output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

    # start predict
    if args.do_predict:
        model.eval()
        assert args.test_file is not None
        test_examples = read_conll_examples(args.test_file,
                                            is_training=False,
                                            has_confidence=False)
        test_features = convert_examples_to_features(test_examples,
                                                     tokenizer,
                                                     args.max_seq_length,
                                                     label_vocab2idx,
                                                     False,
                                                     has_confidence=False)
        logger.info("***** Running prediction *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)
        all_example_ids = torch.tensor([f.example_id for f in test_features],
                                       dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in test_features],
                                   dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_lengths,
                                  all_example_ids)

        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)

        test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels = [],[],[],[]
        for batch_id, batch in enumerate(
                tqdm(test_dataloader, desc="Predicting")):
            input_ids, input_mask, segment_ids, lengths, example_ids = batch
            example_ids = example_ids.numpy()
            batch_words = [
                test_features[eid].example.sentence for eid in example_ids
            ]
            batch_postags = [
                test_features[eid].example.postags for eid in example_ids
            ]
            batch_word_index = [
                test_features[eid].word_index for eid in example_ids
            ]  # token -> word
            batch_token_starts = [
                test_features[eid].token_starts for eid in example_ids
            ]  # word -> token start

            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            lengths = lengths.numpy()

            with torch.no_grad():
                energy = model(input_ids, segment_ids, input_mask)

            heads_pred, labels_pred = parser.decode_MST(energy.cpu().numpy(),
                                                        lengths,
                                                        leading_symbolic=0,
                                                        labeled=True)

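            # Map predictions from BERT sub-token positions back to word level:
            # token_starts holds the first sub-token of each word, and
            # word_index maps a sub-token position back to its word id.
            # [CLS] and [SEP] positions are skipped.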
            pred_heads = []
            pred_labels = []
            for i in range(len(batch_word_index)):
                word_index = batch_word_index[i]
                token_starts = batch_token_starts[i]
                hpd = []
                lpd = []
                for j in range(len(token_starts)):
                    if j == 0:  #[CLS]
                        continue
                    elif j == len(token_starts) - 1:  # [SEP]
                        continue
                    else:
                        hpd.append(word_index[heads_pred[i, token_starts[j]]])
                        lpd.append(label_vocab[labels_pred[i,
                                                           token_starts[j]]])
                pred_heads.append(hpd)
                pred_labels.append(lpd)

            test_predict_words += batch_words
            test_predict_postags += batch_postags
            test_predict_heads += pred_heads
            test_predict_labels += pred_labels

        assert args.test_output is not None
        write_conll_examples(test_predict_words, test_predict_postags,
                             test_predict_heads, test_predict_labels,
                             args.test_output)

    if args.do_greedy_predict:
        model.eval()
        assert args.test_file is not None
        test_examples = read_conll_examples(args.test_file,
                                            is_training=False,
                                            has_confidence=False)
        test_features = convert_examples_to_features(test_examples,
                                                     tokenizer,
                                                     args.max_seq_length,
                                                     label_vocab2idx,
                                                     False,
                                                     has_confidence=False)
        logger.info("***** Running prediction *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)
        all_example_ids = torch.tensor([f.example_id for f in test_features],
                                       dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in test_features],
                                   dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_lengths,
                                  all_example_ids)

        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)

        test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels = [], [], [], []
        for batch_id, batch in enumerate(
                tqdm(test_dataloader, desc="Predicting")):
            input_ids, input_mask, segment_ids, lengths, example_ids = batch
            example_ids = example_ids.numpy()
            batch_words = [
                test_features[eid].example.sentence for eid in example_ids
            ]
            batch_postags = [
                test_features[eid].example.postags for eid in example_ids
            ]
            batch_word_index = [
                test_features[eid].word_index for eid in example_ids
            ]  # token -> word
            batch_token_starts = [
                test_features[eid].token_starts for eid in example_ids
            ]  # word -> token start

            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            lengths = lengths.numpy()

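            # Unlike the MST-decoded path above, the model returns head and
            # label predictions directly when greedy_inference=True, so no
            # separate decoding step is needed.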
            with torch.no_grad():
                heads_pred, labels_pred = model(input_ids,
                                                segment_ids,
                                                input_mask,
                                                greedy_inference=True)

            pred_heads = []
            pred_labels = []
            for i in range(len(batch_word_index)):
                word_index = batch_word_index[i]
                token_starts = batch_token_starts[i]
                hpd = []
                lpd = []
                for j in range(len(token_starts)):
                    if j == 0:  #[CLS]
                        continue
                    elif j == len(token_starts) - 1:  # [SEP]
                        continue
                    else:
                        hpd.append(word_index[heads_pred[i, token_starts[j]]])
                        lpd.append(label_vocab[labels_pred[i,
                                                           token_starts[j]]])
                pred_heads.append(hpd)
                pred_labels.append(lpd)

            test_predict_words += batch_words
            test_predict_postags += batch_postags
            test_predict_heads += pred_heads
            test_predict_labels += pred_labels

        assert args.test_output is not None
        write_conll_examples(test_predict_words, test_predict_postags,
                             test_predict_heads, test_predict_labels,
                             args.test_output)

    if args.do_ensemble_predict:
        assert args.test_file is not None
        test_examples = read_conll_examples(args.test_file,
                                            is_training=False,
                                            has_confidence=False)
        test_features = convert_examples_to_features(test_examples,
                                                     tokenizer,
                                                     args.max_seq_length,
                                                     label_vocab2idx,
                                                     False,
                                                     has_confidence=False)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)
        all_example_ids = torch.tensor([f.example_id for f in test_features],
                                       dtype=torch.long)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.float32)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_lengths = torch.tensor([f.seq_len for f in test_features],
                                   dtype=torch.long)

        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_lengths,
                                  all_example_ids)

        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.test_batch_size)

        test_predict_words, test_predict_postags, test_predict_heads, test_predict_labels = [], [], [], []
        for batch_id, batch in enumerate(
                tqdm(test_dataloader, desc="Predicting")):
            input_ids, input_mask, segment_ids, lengths, example_ids = batch
            example_ids = example_ids.numpy()
            batch_words = [
                test_features[eid].example.sentence for eid in example_ids
            ]
            batch_postags = [
                test_features[eid].example.postags for eid in example_ids
            ]
            batch_word_index = [
                test_features[eid].word_index for eid in example_ids
            ]  # token -> word
            batch_token_starts = [
                test_features[eid].token_starts for eid in example_ids
            ]  # word -> token start

            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            lengths = lengths.numpy()

            with torch.no_grad():
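                # Accumulate the score tensors from every ensemble member and
                # average them before MST decoding.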
                energy_sum = None
                for model in models:
                    energy = model(input_ids, segment_ids, input_mask)
                    if energy_sum is None:
                        energy_sum = energy
                    else:
                        energy_sum = energy_sum + energy

                energy_sum = energy_sum / len(models)

            heads_pred, labels_pred = parser.decode_MST(
                energy_sum.cpu().numpy(),
                lengths,
                leading_symbolic=0,
                labeled=True)

            pred_heads = []
            pred_labels = []
            for i in range(len(batch_word_index)):
                word_index = batch_word_index[i]
                token_starts = batch_token_starts[i]
                hpd = []
                lpd = []
                for j in range(len(token_starts)):
                    if j == 0:  #[CLS]
                        continue
                    elif j == len(token_starts) - 1:  # [SEP]
                        continue
                    else:
                        hpd.append(word_index[heads_pred[i, token_starts[j]]])
                        lpd.append(label_vocab[labels_pred[i,
                                                           token_starts[j]]])
                pred_heads.append(hpd)
                pred_labels.append(lpd)

            test_predict_words += batch_words
            test_predict_postags += batch_postags
            test_predict_heads += pred_heads
            test_predict_labels += pred_labels

        assert args.test_output is not None
        write_conll_examples(test_predict_words, test_predict_postags,
                             test_predict_heads, test_predict_labels,
                             args.test_output)
Example #20
               extra_features=extra_features)
    for sidx, (train_data, dev_data) in enumerate(mediqa_split_data):
        mediqa_train_fout = os.path.join(mt_dnn_root,
                                         'mediqa_{}_train.json'.format(sidx))
        mediqa_dev_fout = os.path.join(mt_dnn_root,
                                       'mediqa_{}_dev.json'.format(sidx))
        build_data(train_data,
                   mediqa_train_fout,
                   extra_features=extra_features)
        build_data(dev_data, mediqa_dev_fout, extra_features=extra_features)
    logger.info('done with mediqa')

    medquad_train_fout = os.path.join(mt_dnn_root, 'medquad_train.json')
    medquad_dev_fout = os.path.join(mt_dnn_root, 'medquad_dev.json')
    build_data(medquad_train_data, medquad_train_fout)
    build_data(medquad_dev_data, medquad_dev_fout)
    logger.info('done with medquad')


if __name__ == '__main__':
    args = parse_args()
    if args.sci_vocab:
        # default to uncased
        bert_tokenizer = BertTokenizer.from_pretrained(
            '../bert_models/scibert_scivocab_uncased/vocab.txt')
    elif args.cased:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    else:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    main(args)
Example #21
def main(*_, **kwargs):
    use_cuda = torch.cuda.is_available() and kwargs["device"] >= 0
    device = torch.device("cuda:" +
                          str(kwargs["device"]) if use_cuda else "cpu")

    if use_cuda:
        torch.cuda.set_device(device)

    kwargs["use_cuda"] = use_cuda

    neptune.create_experiment(
        name="bert-span-parser",
        upload_source_files=[],
        params={
            k: str(v) if isinstance(v, bool) else v
            for k, v in kwargs.items()
        },
    )

    logger.info("Settings: {}", json.dumps(kwargs,
                                           indent=2,
                                           ensure_ascii=False))

    # For reproducibility
    os.environ["PYTHONHASHSEED"] = str(kwargs["seed"])
    random.seed(kwargs["seed"])
    np.random.seed(kwargs["seed"])
    torch.manual_seed(kwargs["seed"])
    torch.cuda.manual_seed_all(kwargs["seed"])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Prepare and load data
    tokenizer = BertTokenizer.from_pretrained(kwargs["bert_model"],
                                              do_lower_case=False)

    logger.info("Loading data...")

    train_treebank = load_trees(kwargs["train_file"])
    dev_treebank = load_trees(kwargs["dev_file"])
    test_treebank = load_trees(kwargs["test_file"])

    logger.info(
        "Loaded {:,} train, {:,} dev, and {:,} test examples!",
        len(train_treebank),
        len(dev_treebank),
        len(test_treebank),
    )

    logger.info("Preprocessing data...")

    train_parse = [tree.convert() for tree in train_treebank]
    train_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                       for tree in train_parse]
    dev_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                     for tree in dev_treebank]
    test_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()]
                      for tree in test_treebank]

    logger.info("Data preprocessed!")

    logger.info("Preparing data for training...")

    tags = []
    labels = []

    for tree in train_parse:
        nodes = [tree]
        while nodes:
            node = nodes.pop()
            if isinstance(node, InternalParseNode):
                labels.append(node.label)
                nodes.extend(reversed(node.children))
            else:
                tags.append(node.tag)

    tag_encoder = LabelEncoder()
    tag_encoder.fit(tags, reserved_labels=["[PAD]", "[UNK]"])

    label_encoder = LabelEncoder()
    label_encoder.fit(labels, reserved_labels=[()])

    logger.info("Data prepared!")

    # Settings
    num_train_optimization_steps = kwargs["num_epochs"] * (
        (len(train_parse) - 1) // kwargs["batch_size"] + 1)
    kwargs["batch_size"] //= kwargs["gradient_accumulation_steps"]

    logger.info("Creating dataloaders for training...")

    train_dataloader, train_features = create_dataloader(
        sentences=train_sentences,
        batch_size=kwargs["batch_size"],
        tag_encoder=tag_encoder,
        tokenizer=tokenizer,
        is_eval=False,
    )
    dev_dataloader, dev_features = create_dataloader(
        sentences=dev_sentences,
        batch_size=kwargs["batch_size"],
        tag_encoder=tag_encoder,
        tokenizer=tokenizer,
        is_eval=True,
    )
    test_dataloader, test_features = create_dataloader(
        sentences=test_sentences,
        batch_size=kwargs["batch_size"],
        tag_encoder=tag_encoder,
        tokenizer=tokenizer,
        is_eval=True,
    )

    logger.info("Dataloaders created!")

    # Initialize model
    model = ChartParser.from_pretrained(
        kwargs["bert_model"],
        tag_encoder=tag_encoder,
        label_encoder=label_encoder,
        lstm_layers=kwargs["lstm_layers"],
        lstm_dim=kwargs["lstm_dim"],
        tag_embedding_dim=kwargs["tag_embedding_dim"],
        label_hidden_dim=kwargs["label_hidden_dim"],
        dropout_prob=kwargs["dropout_prob"],
    )

    model.to(device)

    # Prepare optimizer
    param_optimizers = list(model.named_parameters())

    if kwargs["freeze_bert"]:
        for p in model.bert.parameters():
            p.requires_grad = False
        param_optimizers = [(n, p) for n, p in param_optimizers
                            if p.requires_grad]

    # Hack to remove the pooler, which is not used and thus produces a None grad that breaks apex
    param_optimizers = [n for n in param_optimizers if "pooler" not in n[0]]

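    # Exclude biases and LayerNorm parameters from weight decay.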
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizers
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizers
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=kwargs["learning_rate"],
        warmup=kwargs["warmup_proportion"],
        t_total=num_train_optimization_steps,
    )

    if kwargs["fp16"]:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    pretrained_model_file = os.path.join(kwargs["output_dir"], MODEL_FILENAME)

    if kwargs["do_eval"]:
        assert os.path.isfile(
            pretrained_model_file), "Pretrained model file does not exist!"

        logger.info("Loading pretrained model from {}", pretrained_model_file)

        # Load model from file
        params = torch.load(pretrained_model_file, map_location=device)

        model.load_state_dict(params["model"])

        logger.info(
            "Loaded pretrained model (Epoch: {:,}, Fscore: {:.2f})",
            params["epoch"],
            params["fscore"],
        )

        eval_score = eval(
            model=model,
            eval_dataloader=test_dataloader,
            eval_features=test_features,
            eval_trees=test_treebank,
            eval_sentences=test_sentences,
            tag_encoder=tag_encoder,
            device=device,
        )

        neptune.send_metric("test_eval_precision", eval_score.precision())
        neptune.send_metric("test_eval_recall", eval_score.recall())
        neptune.send_metric("test_eval_fscore", eval_score.fscore())

        tqdm.write("Evaluation score: {}".format(str(eval_score)))
    else:
        # Training phase
        global_steps = 0
        start_epoch = 0
        best_dev_fscore = 0

        if kwargs["preload"] or kwargs["resume"]:
            assert os.path.isfile(
                pretrained_model_file), "Pretrained model file does not exist!"

            logger.info("Resuming model from {}", pretrained_model_file)

            # Load model from file
            params = torch.load(pretrained_model_file, map_location=device)

            model.load_state_dict(params["model"])

            if kwargs["resume"]:
                optimizer.load_state_dict(params["optimizer"])

                torch.cuda.set_rng_state_all([
                    state.cpu()
                    for state in params["torch_cuda_random_state_all"]
                ])
                torch.set_rng_state(params["torch_random_state"].cpu())
                np.random.set_state(params["np_random_state"])
                random.setstate(params["random_state"])

                global_steps = params["global_steps"]
                start_epoch = params["epoch"] + 1
                best_dev_fscore = params["fscore"]
        else:
            assert not os.path.isfile(
                pretrained_model_file
            ), "Please remove or move the pretrained model file to another place!"

        for epoch in trange(start_epoch, kwargs["num_epochs"], desc="Epoch"):
            model.train()

            train_loss = 0
            num_train_steps = 0

            for step, (indices, *_) in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                ids, attention_masks, tags, sections, trees, sentences = prepare_batch_input(
                    indices=indices,
                    features=train_features,
                    trees=train_parse,
                    sentences=train_sentences,
                    tag_encoder=tag_encoder,
                    device=device,
                )

                loss = model(
                    ids=ids,
                    attention_masks=attention_masks,
                    tags=tags,
                    sections=sections,
                    sentences=sentences,
                    gold_trees=trees,
                )

                if kwargs["gradient_accumulation_steps"] > 1:
                    loss /= kwargs["gradient_accumulation_steps"]

                if kwargs["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                train_loss += loss.item()

                num_train_steps += 1

                if (step + 1) % kwargs["gradient_accumulation_steps"] == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_steps += 1

            # Write logs
            neptune.send_metric("train_loss", epoch,
                                train_loss / num_train_steps)
            neptune.send_metric("global_steps", epoch, global_steps)

            tqdm.write(
                "Epoch: {:,} - Train loss: {:.4f} - Global steps: {:,}".format(
                    epoch, train_loss / num_train_steps, global_steps))

            # Evaluate
            eval_score = eval(
                model=model,
                eval_dataloader=dev_dataloader,
                eval_features=dev_features,
                eval_trees=dev_treebank,
                eval_sentences=dev_sentences,
                tag_encoder=tag_encoder,
                device=device,
            )

            neptune.send_metric("eval_precision", epoch,
                                eval_score.precision())
            neptune.send_metric("eval_recall", epoch, eval_score.recall())
            neptune.send_metric("eval_fscore", epoch, eval_score.fscore())

            tqdm.write("Epoch: {:,} - Evaluation score: {}".format(
                epoch, str(eval_score)))

            # Save best model
            if eval_score.fscore() > best_dev_fscore:
                best_dev_fscore = eval_score.fscore()

                tqdm.write("** Saving model...")

                os.makedirs(kwargs["output_dir"], exist_ok=True)

                torch.save(
                    {
                        "epoch": epoch,
                        "global_steps": global_steps,
                        "fscore": best_dev_fscore,
                        "random_state": random.getstate(),
                        "np_random_state": np.random.get_state(),
                        "torch_random_state": torch.get_rng_state(),
                        "torch_cuda_random_state_all": torch.cuda.get_rng_state_all(),
                        "optimizer": optimizer.state_dict(),
                        "model": (model.module if hasattr(model, "module")
                                  else model).state_dict(),
                    },
                    pretrained_model_file,
                )

            tqdm.write(
                "** Best evaluation fscore: {:.2f}".format(best_dev_fscore))
Example #22
	seed = args.seed
	random.seed(seed)
	np.random.seed(seed)
	torch.manual_seed(seed)
	torch.cuda.manual_seed(seed)
	if n_gpu > 1:
		torch.cuda.manual_seed_all(args.seed)

	## Import proper model
	if args.is_sequence_labeling:
		from model import BertForSequenceLabeling as Model
	else:
		from model import BertForSequenceClassification as Model

	logging.info('Building tokenizer...')
	tokenizer = BertTokenizer(args.pretrained_weights_dir+'vocab.txt')
	
	logging.info('Loading data...')
	data = Data(args.task_name,
				args.data_dir,
				tokenizer,
				args.max_seq_len,
				args.is_sequence_labeling)

	logging.info('Building Model...')
	model = Model.from_pretrained(args.pretrained_weights_dir, data.label_size)
	model.to(device)

	param_optimizer = list(model.named_parameters())
	no_decay = ['bias', 'gamma', 'beta']
	optimizer_grouped_parameters = [
Example #23
def online_test_coref(config, input_text):
    """
    输入一段文本,进行指代消解任务
    :param config: 配置参数
    :return: None
    """
    def create_example(text):
        """将文字转为模型需要的样例格式"""
        sentences = [['[CLS]'] + tokenizer.tokenize_not_UNK(text) + ['[SEP]']]
        sentence_map = [0] * len(sentences[0])
        speakers = [["-" for _ in sentence] for sentence in sentences]
        subtoken_map = [i for i in range(len(sentences[0]))]
        return {
            "doc_key": "bn",
            "clusters": [],
            "sentences": sentences,
            "speakers": speakers,
            'sentence_map': sentence_map,
            'subtoken_map': subtoken_map
        }

    tokenizer = BertTokenizer.from_pretrained(config['vocab_file'],
                                              do_lower_case=True)
    online_coref_output_file = config['online_output_path']

    example = create_example(input_text)

    model = CorefModel.from_pretrained(config["model_save_path"],
                                       coref_task_config=config)
    model.to(device)

    model.eval()
    with open(online_coref_output_file, 'w', encoding="utf-8") as output_file:

        with torch.no_grad():
            tensorized_example = model.tensorize_example(example,
                                                         is_training=False)

            input_ids = torch.from_numpy(
                tensorized_example[0]).long().to(device)
            input_mask = torch.from_numpy(
                tensorized_example[1]).long().to(device)
            text_len = torch.from_numpy(
                tensorized_example[2]).long().to(device)
            speaker_ids = torch.from_numpy(
                tensorized_example[3]).long().to(device)
            genre = torch.tensor(tensorized_example[4]).long().to(device)
            is_training = tensorized_example[5]
            gold_starts = torch.from_numpy(
                tensorized_example[6]).long().to(device)
            gold_ends = torch.from_numpy(
                tensorized_example[7]).long().to(device)
            cluster_ids = torch.from_numpy(
                tensorized_example[8]).long().to(device)
            sentence_map = torch.Tensor(
                tensorized_example[9]).long().to(device)

            (_, _, _, top_span_starts, top_span_ends, top_antecedents, top_antecedent_scores), _ = \
                model(input_ids, input_mask, text_len, speaker_ids, genre,
                      is_training, gold_starts, gold_ends,
                      cluster_ids, sentence_map)

            predicted_antecedents = model.get_predicted_antecedents(
                top_antecedents.cpu(), top_antecedent_scores.cpu())
            # Predicted entity span indices
            example["predicted_clusters"], _ = model.get_predicted_clusters(
                top_span_starts, top_span_ends, predicted_antecedents)
            # Map span indices back to text
            example_sentence = utils.flatten(example["sentences"])
            predicted_list = []
            for same_entity in example["predicted_clusters"]:
                same_entity_list = []
                num_same_entity = len(same_entity)
                for index in range(num_same_entity):
                    entity_name = ''.join(example_sentence[
                        same_entity[index][0]:same_entity[index][1] + 1])
                    same_entity_list.append(entity_name)
                predicted_list.append(same_entity_list)
                same_entity_list = []  # reset the list
            example["predicted_idx2entity"] = predicted_list

            example["top_spans"] = list(
                zip((int(i) for i in top_span_starts),
                    (int(i) for i in top_span_ends)))
            example['head_scores'] = []

            output_file.write(json.dumps(example, ensure_ascii=False))
            output_file.write("\n")
Example #24
from utils.args import get_args
import torch
import numpy as np
from utils.reader import load_vocab
from bert.tokenization import BertTokenizer
from utils.parser import get_span_to_node_mapping, parse_tree
import csv, pickle
from collections import defaultdict
from utils.args import get_best_snapshot
from nns.linear_model import BOWRegression, BOWRegressionMulti
import argparse

args = get_args()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir='bert/cache')

def unigram_linear_pearson(filename):
    f = open(filename)
    model = torch.load(args.bow_snapshot, map_location='cpu')
    vocab = load_vocab(VOCAB)
    out, truth = [], []
    coeff_dict = {}
    scores_dict = defaultdict(list)
    valid, total = 0, 0
    for lidx, line in enumerate(f.readlines()):
        if lidx < MINLINE: continue
        if lidx == MAXLINE: break
        l = line.lower().strip().split('\t')
        for entry in l:
            items = entry.strip().split(' ')
            if len(items) > 2:
                continue
Example #25
        self.dev_data = DataProcessor(path + 'dev.tsv', loader.load_dev_data,
                                      loader.label_map, tokenizer, max_seq_len,
                                      is_sequence_labeling)
        logging.info('Demo test data')
        self.test_data = DataProcessor(path + 'test.tsv',
                                       loader.load_test_data, loader.label_map,
                                       tokenizer, max_seq_len,
                                       is_sequence_labeling)

        self.label_size = len(loader.label_map)
        self.label_map = loader.label_map
        self.reverse_label_map = loader.reverse_label_map


if __name__ == '__main__':

    logging.info('Building tokenizer...')
    tokenizer = BertTokenizer('pretrained_weights/vocab.txt')

    logging.info('Loading data...')
    path = './data/CoLA/'
    data = Data('cola', path, tokenizer)

    logging.info('Loading data...')
    path = './data/MRPC/'
    data = Data('mrpc', path, tokenizer)

    logging.info('Loading data...')
    path = './data/NER/'
    data = Data('sequencelabeling', path, tokenizer, is_sequence_labeling=True)
Example #26
                'att_mask': torch.LongTensor(input_mask)
            }
        return {
            'question': question,
            'passage': passage,
            'input': torch.LongTensor(input_ids).unsqueeze(0),
            'seg': torch.LongTensor(segment_ids).unsqueeze(0),
            'att_mask': torch.LongTensor(input_mask).unsqueeze(0)
        }


if __name__ == "__main__":
    print('load model')
    model = load_bert_model()
    print('convert')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                              do_lower_case=True)
    cvt = BertInputConverter(tokenizer)
    examples = read_from_demo_txt_file()

    for question, passage in examples:
        sample = cvt.convert(question, passage, args.max_query_length,
                             args.max_seq_length)
        print('Question')
        print(sample['question'])
        print('Passage')
        print(sample['passage'])
        answer = predict_one_sample(model, sample)
        print('Answer')
        print(answer)
Example #27
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )

    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--log_every',
                        type=int,
                        default=100,
                        help="Log every X batch")
    parser.add_argument("--mlm_only",
                        action='store_true',
                        help="Only use MLM objective")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        print(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    logger = util.get_logger(f'{args.output_dir}/exp.txt')
    for key, value in vars(args).items():
        logger.info('command line argument: %s - %r', key, value)

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
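        # for-else: this branch runs only when the loop finished without
        # break, i.e. pregenerated data exists for every training epoch.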
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

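    # The effective batch size is unchanged: each optimizer step accumulates
    # gradients over gradient_accumulation_steps smaller forward/backward passes.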
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # Prepare model
    if args.mlm_only:
        model = BertForMaskedLM.from_pretrained(args.bert_model)
    else:
        model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    if args.mlm_only:
        param_optimizer = [
            x for x in param_optimizer if 'bert.pooler' not in x[0]
        ]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {total_train_examples}")
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            logger=logger,
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            mlm_only=args.mlm_only)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        losses = []
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                if args.mlm_only:
                    input_ids, input_mask, segment_ids, lm_label_ids = batch
                    loss = model(input_ids, segment_ids, input_mask,
                                 lm_label_ids)
                else:
                    input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                    loss = model(input_ids, segment_ids, input_mask,
                                 lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                losses.append(loss.item())
                if step % args.log_every == 0:
                    logger.info(
                        f"loss at ep {epoch} batch {step}/{len(train_dataloader)} is {np.mean(losses):.5f}"
                    )
                    losses = []
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used, which handles this automatically
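                        # warmup_linear in older pytorch_pretrained_bert
                        # releases behaves roughly as x / warmup while
                        # x < warmup, then decays linearly as 1 - x.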
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = args.output_dir / f"epoch{epoch}_pytorch_model.bin"
        torch.save(model_to_save.state_dict(), str(output_model_file))
Example #28
def fit_tfidf_model(dataset):
    if dataset == 'gab':
        data_processor = GabProcessor(configs)
    else:  # dataset is 'ws'
        configs.data_dir = './data/white_supremacy/'
        data_processor = WSProcessor(configs)

    model = LogisticRegression()
    tokenizer = BertTokenizer.from_pretrained(
        configs.bert_model, do_lower_case=configs.do_lower_case)

    train_examples, val_examples = data_processor.get_train_examples(configs.data_dir), \
                                            data_processor.get_dev_examples(configs.data_dir)
    random.shuffle(train_examples)

    gab_processor = GabProcessor(configs)
    gab_test_examples = gab_processor.get_test_examples(
        './data/majority_gab_dataset_25k/')

    _, train_labels, train_tokens = examples_to_bow(train_examples, tokenizer,
                                                    configs.max_seq_length)
    _, val_labels, val_tokens = examples_to_bow(val_examples, tokenizer,
                                                configs.max_seq_length)
    _, test_labels, test_tokens = examples_to_bow(gab_test_examples, tokenizer,
                                                  configs.max_seq_length)

    train_docs, val_docs = [' '.join(x) for x in train_tokens
                            ], [' '.join(x) for x in val_tokens]

    # binary BOW vector performs better than tfidf
    #vectorizer = TfidfVectorizer(tokenizer=str.split)
    vectorizer = CountVectorizer(binary=True)

    X = vectorizer.fit_transform(train_docs)

    neg_weight = 0.125 if dataset == 'ws' else 0.1
    weights = [1 if x == 1 else neg_weight for x in train_labels]

    model.fit(X, train_labels, weights)

    X_val = vectorizer.transform(val_docs)

    pred_gab_val = model.predict(X_val)
    f1 = f1_score(val_labels, pred_gab_val)
    print('val f1: %f' % f1)

    test_docs = [' '.join(x) for x in test_tokens]
    X_test = vectorizer.transform(test_docs)
    pred_gab_test = model.predict(X_test)
    gab_f1 = f1_score(test_labels, pred_gab_test)
    gab_p, gab_r = precision_score(test_labels, pred_gab_test), recall_score(
        test_labels, pred_gab_test)

    print('Gab test f1: %f (%f, %f)' % (gab_f1, gab_p, gab_r))

    ws_processor, nyt_processor = WSProcessor(configs), NytProcessor(
        configs, subset=dataset == 'ws')
    ws_test_examples = ws_processor.get_test_examples('data/white_supremacy')
    _, test_labels, test_tokens = examples_to_bow(ws_test_examples, tokenizer,
                                                  configs.max_seq_length)
    test_docs = [' '.join(x) for x in test_tokens]
    X_test = vectorizer.transform(test_docs)
    pred_ws_test = model.predict(X_test)
    ws_f1 = f1_score(test_labels, pred_ws_test)
    ws_p, ws_r = precision_score(test_labels, pred_ws_test), recall_score(
        test_labels, pred_ws_test)
    print('WS test f1: %f (%f, %f)' % (ws_f1, ws_p, ws_r))

    nyt_test_examples = nyt_processor.get_test_examples(
        'data/nyt_keyword_sample')
    _, test_labels, test_tokens = examples_to_bow(nyt_test_examples, tokenizer,
                                                  configs.max_seq_length)
    test_docs = [' '.join(x) for x in test_tokens]
    X_test = vectorizer.transform(test_docs)
    pred_nyt_test = model.predict(X_test)
    nyt_f1 = accuracy_score(test_labels, pred_nyt_test)
    print('Nyt test f1: %f' % nyt_f1)

    dump_coeff(model, vectorizer)
    return gab_f1, gab_p, gab_r, ws_f1, ws_p, ws_r, nyt_f1
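
# Illustrative driver (not part of the original example): fit the bag-of-words
# baseline on each of the two datasets the function above branches on.
if __name__ == '__main__':
    for ds in ('gab', 'ws'):
        fit_tfidf_model(ds)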
Example #29
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--negative_weight", default=1., type=float)
    parser.add_argument("--neutral_words_file", default='data/identity.csv')

    # if true, use test data instead of val data
    parser.add_argument("--test", action='store_true')

    # Explanation specific arguments below

    # whether run explanation algorithms
    parser.add_argument("--explain",
                        action='store_true',
                        help='if true, explain test set predictions')
    parser.add_argument("--debug", action='store_true')

    # which algorithm to run
    parser.add_argument("--algo", choices=['soc'])

    # the output filename without postfix
    parser.add_argument("--output_filename", default='temp.tmp')

    # see utils/config.py
    parser.add_argument("--use_padding_variant", action='store_true')
    parser.add_argument("--mask_outside_nb", action='store_true')
    parser.add_argument("--nb_range", type=int)
    parser.add_argument("--sample_n", type=int)

    # whether use explanation regularization
    parser.add_argument("--reg_explanations", action='store_true')
    parser.add_argument("--reg_strength", type=float)
    parser.add_argument("--reg_mse", action='store_true')

    # whether discard other neutral words during regularization. default: False
    parser.add_argument("--discard_other_nw",
                        action='store_false',
                        dest='keep_other_nw')

    # whether remove neutral words when loading datasets
    parser.add_argument("--remove_nw", action='store_true')

    # if true, generate hierarchical explanations instead of word level outputs.
    # Only useful when the --explain flag is also added.
    parser.add_argument("--hiex", action='store_true')
    parser.add_argument("--hiex_tree_height", default=5, type=int)

    # whether add the sentence itself to the sample set in SOC
    parser.add_argument("--hiex_add_itself", action='store_true')

    # the directory where the lm is stored
    parser.add_argument("--lm_dir", default='runs/lm')

    # if configured, only generate explanations for instances with given line numbers
    parser.add_argument("--hiex_idxs", default=None)
    # if true, use absolute values of explanations for hierarchical clustering
    parser.add_argument("--hiex_abs", action='store_true')

    # if either of the two is true, only generate explanations for positive / negative instances
    parser.add_argument("--only_positive", action='store_true')
    parser.add_argument("--only_negative", action='store_true')

    # stop after generating x explanation
    parser.add_argument("--stop", default=100000000, type=int)

    # early stopping with decreasing learning rate. 0: direct exit when validation F1 decreases
    parser.add_argument("--early_stop", default=5, type=int)

    # other external arguments originally here in pytorch_transformers

    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--validate_steps",
                        default=200,
                        type=int,
                        help="validate once for how many steps")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    combine_args(configs, args)
    args = configs

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        'gab': GabProcessor,
        'ws': WSProcessor,
        'nyt': NytProcessor,
        'MT': MTProcessor,
        #'multi-label': multilabel_Processor,
    }

    output_modes = {
        'gab': 'classification',
        'ws': 'classification',
        'nyt': 'classification'
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    #if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # save configs
    with open(os.path.join(args.output_dir, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f, indent=4)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    processor = processors[task_name](configs, tokenizer=tokenizer)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (num_train_optimization_steps //
                                            torch.distributed.get_world_size())

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    if args.do_train:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, cache_dir=cache_dir, num_labels=num_labels)

    else:
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
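        # In eval-only mode the fine-tuned checkpoint written by save_model to
        # output_dir is loaded instead of the raw pretrained weights.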
    model.to(device)

    if args.fp16:
        model.half()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    # elif n_gpu > 1:
    #     model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
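    # Standard BERT fine-tuning practice: bias and LayerNorm parameters are
    # excluded from weight decay; all other parameters use weight decay 0.01.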
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)

    else:
        if args.do_train:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)
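            # BertAdam applies the warmup-linear learning-rate schedule internally,
            # so the fp32 path needs no manual learning-rate adjustment (unlike the
            # fp16 branch of the training loop below).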

    global_step = 0
    nb_tr_steps = 0
    tr_loss, tr_reg_loss = 0, 0
    tr_reg_cnt = 0
    epoch = -1
    val_best_f1 = -1
    val_best_loss = 1e10
    early_stop_countdown = args.early_stop

    if args.reg_explanations:
        train_lm_dataloader = processor.get_dataloader('train',
                                                       configs.train_batch_size)
        dev_lm_dataloader = processor.get_dataloader('dev',
                                                     configs.train_batch_size)
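        # SamplingAndOcclusionExplain scores word/phrase importance via sampling and
        # occlusion; during training its compute_explanation_loss adds a regularization
        # term on these explanations and backpropagates it internally (see the loop below).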
        explainer = SamplingAndOcclusionExplain(
            model,
            configs,
            tokenizer,
            device=device,
            vocab=tokenizer.vocab,
            train_dataloader=train_lm_dataloader,
            dev_dataloader=dev_lm_dataloader,
            lm_dir=args.lm_dir,
            output_path=os.path.join(configs.output_dir,
                                     configs.output_filename),
        )
    else:
        explainer = None

    if args.do_train:
        epoch = 0
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode,
                                                      configs)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
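        # DistributedSampler partitions the training data across processes;
        # RandomSampler simply shuffles it in the single-process case.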
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        class_weight = torch.FloatTensor([args.negative_weight, 1]).to(device)
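        # Class weights for the cross-entropy loss below: args.negative_weight
        # re-weights the negative class (label index 0) relative to the positive one.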

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # compute the loss for the current output_mode (classification or regression)
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss(class_weight)
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # regularize explanations
                # NOTE: backward performed inside this function to prevent OOM

                if args.reg_explanations:
                    reg_loss, reg_cnt = explainer.compute_explanation_loss(
                        input_ids,
                        input_mask,
                        segment_ids,
                        label_ids,
                        do_backprop=True)
                    tr_reg_loss += reg_loss  # float
                    tr_reg_cnt += reg_cnt

                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
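                        # NOTE: the arguments expected by get_lr differ across
                        # pytorch_pretrained_bert versions (some take the raw step
                        # count); verify against the installed version.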
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % args.validate_steps == 0:
                    val_result = validate(args, model, processor, tokenizer,
                                          output_mode, label_list, device,
                                          num_labels, task_name, tr_loss,
                                          global_step, epoch, explainer)
                    val_acc, val_f1 = val_result['acc'], val_result['f1']
                    if val_f1 > val_best_f1:
                        val_best_f1 = val_f1
                        if (args.local_rank == -1
                                or torch.distributed.get_rank() == 0):
                            save_model(args, model, tokenizer, num_labels)
                    else:
                        # halve the learning rate
                        for param_group in optimizer.param_groups:
                            param_group['lr'] *= 0.5
                        early_stop_countdown -= 1
                        logger.info(
                            "Reducing learning rate... Early stop countdown %d"
                            % early_stop_countdown)
                    if early_stop_countdown < 0:
                        break
            if early_stop_countdown < 0:
                break
            epoch += 1

            # training finished ############################

    # if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
    #     if not args.explain:
    #         args.test = True
    #         validate(args, model, processor, tokenizer, output_mode, label_list, device, num_labels,
    #                  task_name, tr_loss, global_step=0, epoch=-1, explainer=explainer)
    #     else:
    #         args.test = True
    #         explain(args, model, processor, tokenizer, output_mode, label_list, device)
    if not args.explain:
        args.test = True
        print('--Test_args.test: %s' % str(args.test))  #Test_args.test: True
        validate(args,
                 model,
                 processor,
                 tokenizer,
                 output_mode,
                 label_list,
                 device,
                 num_labels,
                 task_name,
                 tr_loss,
                 global_step=888,
                 epoch=-1,
                 explainer=explainer)
        args.test = False
    else:
        print('--Test_args.test: %s' % str(args.test))  # Test_args.test: True
        args.test = True
        explain(args, model, processor, tokenizer, output_mode, label_list,
                device)
        args.test = False
Example #30
0
    def __init__(self, config, coref_task_config):
        super(CorefModel, self).__init__(config)

        self.config = coref_task_config
        self.max_segment_len = self.config['max_segment_len']
        self.max_span_width = self.config["max_span_width"]
        self.genres = {g: i for i, g in enumerate(self.config["genres"])}
        self.subtoken_maps = {}
        self.gold = {}
        self.eval_data = None
        self.bert_config = modeling.BertConfig.from_json_file(self.config["bert_config_file"])
        self.tokenizer = BertTokenizer.from_pretrained(self.config['vocab_file'], do_lower_case=True)
        self.bert = BertModel(config=self.bert_config)
        self.dropout = nn.Dropout(self.config["dropout_rate"])
        self.emb_dim = self.bert_config.hidden_size*2 + int(self.config["use_features"])*20 + int(self.config["model_heads"])*self.bert_config.hidden_size
        self.slow_antecedent_dim = self.emb_dim*3 + int(self.config["use_metadata"])*40 + int(self.config["use_features"])*20 + int(self.config['use_segment_distance'])*20
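        # emb_dim: span start + end hidden states (2 * hidden_size), an optional
        # span-width feature, and an optional head-attention-weighted token sum
        # (the hard-coded 20s/40 presumably correspond to config["feature_size"]).
        # slow_antecedent_dim: [span, antecedent, span*antecedent] embeddings plus
        # optional speaker/genre metadata, antecedent-offset, and segment-distance features.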

        # span-width embedding
        if self.config["use_features"]:
            self.span_width_embedding = nn.Embedding(
                                        num_embeddings=self.config["max_span_width"],
                                        embedding_dim=self.config["feature_size"])
        # span head embedding
        if self.config["model_heads"]:
            print("------ adding span head information ------")
            self.masked_mention_score = nn.Sequential(
                                                nn.Linear(self.bert_config.hidden_size, 1),
                                                Squeezer(dim=1))

        # mention score: two-layer feed-forward network
        self.mention_scores = Score(self.emb_dim, self.config["ffnn_size"])

        # prior_width_embedding
        if self.config['use_prior']:
            self.span_width_prior_embeddings = nn.Embedding(
                                        num_embeddings=self.config["max_span_width"],
                                        embedding_dim=self.config["feature_size"])

            # span-width score: two-layer feed-forward network
            self.width_scores = Score(self.config["feature_size"], self.config["ffnn_size"])

        # document genre embedding [7, 20]
        self.genres_embedding = nn.Embedding(
                                        num_embeddings=len(self.genres),
                                        embedding_dim=self.config["feature_size"])

        # scores for the top-c candidate antecedents: a linear layer + dropout
        self.fast_antecedent_scores = nn.Sequential(
                                    nn.Linear(self.emb_dim, self.emb_dim),
                                    nn.Dropout(self.config["dropout_rate"]))
        # antecedent distance embedding
        if self.config['use_prior']:
            self.antecedent_distance_embedding = nn.Embedding(
                                            num_embeddings=10,
                                            embedding_dim=self.config["feature_size"])

            self.antecedent_distance_linear = nn.Linear(self.config["feature_size"], 1)

        if self.config["use_metadata"]:
            # same-speaker embedding [2, 20]
            self.same_speaker_embedding = nn.Embedding(
                                                num_embeddings=2,
                                                embedding_dim=self.config["feature_size"])
        if self.config["use_features"]:
            self.antecedent_offset_embedding = nn.Embedding(
                                            num_embeddings=10,
                                            embedding_dim=self.config["feature_size"])
        if self.config['use_segment_distance']:
            self.segment_distance_embedding = nn.Embedding(
                                            num_embeddings=self.config['max_training_sentences'],
                                            embedding_dim=self.config["feature_size"])

        # slow antecedent scores: two-layer feed-forward network over the 3-D input
        if self.config['fine_grained']:
            self.slow_antecedent_scores = nn.Sequential(
                nn.Linear(self.slow_antecedent_dim, self.config["ffnn_size"]),
                nn.ReLU(inplace=True),
                nn.Dropout(self.config["dropout_rate"]),
                nn.Linear(self.config["ffnn_size"], 1),
                Squeezer(dim=-1)
            )

            # linear layer + sigmoid
            self.coref_layer_linear = nn.Sequential(
                            nn.Linear(self.emb_dim*2, self.emb_dim),
                            nn.Sigmoid()
            )

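        # init_bert_weights (inherited from the pytorch_pretrained_bert base class)
        # initializes all Linear/Embedding/LayerNorm sub-modules; the pretrained
        # encoder weights are presumably loaded from a checkpoint afterwards,
        # overwriting the random BERT initialization.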
        self.apply(self.init_bert_weights)