def main():
    # Configure the online preprocessing module and load the training data.
    import onlinePreprocess
    from onlinePreprocess import prepare_data_online
    onlinePreprocess.seq_length = opt.max_sent_length
    onlinePreprocess.max_doc_len = opt.max_doc_len
    onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
    onlinePreprocess.norm_lambda = opt.norm_lambda
    dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt,
                                  opt.tgt_vocab, opt.train_oracle,
                                  opt.train_src_rouge, opt.train_src_section,
                                  opt.drop_too_short, opt.drop_too_long)

    trainData = neusum.Dataset(
        dataset['train']['src'], dataset['train']['src_raw'],
        dataset['train']['tgt'], dataset['train']['oracle'],
        dataset['train']['src_rouge'], dataset['train']['src_section'],
        dataset['train']['src_section_raw'], opt.batch_size, opt.max_doc_len,
        opt.gpus, dataset['train']['bert_annotation'],
        good_patterns=loglinear.Config.Keyword[opt.qtype], use_good=True)

    dicts = dataset['dicts']
    print(' * vocabulary size. source = %d' % (dicts['src'].size()))
    print(' * number of training sentences. %d' % len(dataset['train']['src']))
    print(' * maximum batch size. %d' % opt.batch_size)

    print('Building model...')
    # Build the neural pointer-network summarizer.
    sent_encoder = neusum.Models.Encoder(opt, dicts['src'])
    doc_encoder = neusum.Models.DocumentEncoder(opt)
    pointer = neusum.Models.Pointer(opt)
    if opt.dec_init == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(opt.dec_init))
    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer, decIniter)

    # Build the log-linear model and attach its hand-crafted rules, then report on both models.
    log_linear_model = loglinear.model.LogLinear()
    log_linear_model.set_rules(opt.position_weight, opt.keyword_weight,
                               loglinear.Config.Keyword[opt.qtype],
                               opt.in_bert_weight, opt.in_section_weight,
                               loglinear.Config.PossibleSection[opt.qtype],
                               opt.section_embedding, opt.pre_word_vecs_enc)
    get_report(model, 'Neural-based Model')
    get_report(log_linear_model, 'Log-linear Model')
def buildData(self, srcBatch, srcRaw, tgtRaw, oracleBatch, srcRougeBatch,
              src_section_batch, src_section_raw, bert_annotation=None,
              good_patterns: List[str] = None, use_good: bool = False):
    """ (used in load_dev_data) """
    srcData = [[self.src_dict.convertToIdx(b, neusum.Constants.UNK_WORD) for b in doc]
               for doc in srcBatch]
    srcBatchData = [[self.src_dict.convertToIdx(b, neusum.Constants.UNK_WORD) for b in doc]
                    for doc in src_section_batch]
    return neusum.Dataset(srcData, srcRaw, tgtRaw, oracleBatch, srcRougeBatch,
                          srcBatchData, src_section_raw,
                          self.opt.batch_size,
                          # self.opt.max_doc_len, self.opt.cuda, volatile=True)
                          self.opt.max_doc_len, self.opt.cuda,
                          bert_annotation=bert_annotation,
                          good_patterns=good_patterns, use_good=use_good)
def buildData(self, srcBatch, srcRaw, tgtRaw, oracleBatch, srcRougeBatch):
    srcData = [[self.src_dict.convertToIdx(b, neusum.Constants.UNK_WORD) for b in doc]
               for doc in srcBatch]
    return neusum.Dataset(srcData, srcRaw, tgtRaw, oracleBatch, srcRougeBatch,
                          self.opt.batch_size, self.opt.max_doc_len,
                          self.opt.cuda, volatile=True)
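For reference, the sketch below shows how this simpler buildData variant might be invoked when assembling a small dev set. The data formats (tokenized documents, oracle sentence indices, per-sentence ROUGE scores) and the variable names are illustrative assumptions, not part of the project code.

# Illustrative call only; `summarizer` is assumed to be a neusum.Summarizer
# instance as constructed in main(), and the input formats are guesses.
src_batch = [[['the', 'first', 'sentence', '.'],
              ['the', 'second', 'sentence', '.']]]             # tokenized documents
src_raw = [['the first sentence .', 'the second sentence .']]  # raw sentence strings
tgt_raw = [['a reference summary .']]                          # reference summaries
oracle_batch = [[0]]                                           # oracle sentence indices (assumed format)
src_rouge_batch = [[0.41, 0.17]]                               # per-sentence ROUGE gains (assumed format)

dev_data = summarizer.buildData(src_batch, src_raw, tgt_raw,
                                oracle_batch, src_rouge_batch)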
def main():
    if not opt.online_process_data:
        raise Exception(
            'This code does not use preprocessed .pt pickle file. It has some issues with big files.'
        )
        # dataset = torch.load(opt.data)
    else:
        import onlinePreprocess
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt,
                                      opt.tgt_vocab, opt.train_oracle,
                                      opt.train_src_rouge, opt.train_src_section,
                                      opt.drop_too_short, opt.drop_too_long)

    trainData = neusum.Dataset(
        dataset['train']['src'], dataset['train']['src_raw'],
        dataset['train']['tgt'], dataset['train']['oracle'],
        dataset['train']['src_rouge'], dataset['train']['src_section'],
        dataset['train']['src_section_raw'], opt.batch_size, opt.max_doc_len,
        opt.gpus, dataset['train']['bert_annotation'],
        good_patterns=loglinear.Config.Keyword[opt.qtype], use_good=True)

    dicts = dataset['dicts']
    # logger.info(' * vocabulary size. source = %d; target = %d' %
    #             (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * vocabulary size. source = %d' % (dicts['src'].size()))
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    # sent_encoder = loglinear.model.SentEncoder(opt, dicts['src'])
    # model = loglinear.model.LogLinear(sent_encoder)
    if opt.gpus:
        model = loglinear.model.LogLinear(use_gpu=True)
    else:
        model = loglinear.model.LogLinear(use_gpu=False)
    model.set_rules(opt.position_weight, opt.keyword_weight,
                    loglinear.Config.Keyword[opt.qtype],
                    opt.in_bert_weight, opt.in_section_weight,
                    loglinear.Config.PossibleSection[opt.qtype],
                    opt.section_embedding, opt.pre_word_vecs_enc)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    # sent_encoder.load_pretrained_vectors(opt, logger)

    optim = neusum.Optim(opt.optim, opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)
    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        summarizer = neusum.Summarizer(opt, model, dataset)
        validData = load_dev_data(
            summarizer, opt.dev_input_src, opt.dev_ref,
            opt.dev_input_src_section, opt.drop_too_short, opt.drop_too_long,
            test_bert_annotation=opt.test_bert_annotation)

    trainModel(model, trainData, validData, dataset, optim)
def main():
    if not opt.online_process_data:
        raise Exception(
            'This code does not use preprocessed .pt pickle file. It has some issues with big files.'
        )
        # dataset = torch.load(opt.data)
    else:
        import onlinePreprocess
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt,
                                      opt.tgt_vocab, opt.train_oracle,
                                      opt.train_src_rouge)

    trainData = neusum.Dataset(dataset['train']['src'], dataset['train']['src_raw'],
                               dataset['train']['tgt'], dataset['train']['oracle'],
                               dataset['train']['src_rouge'],
                               opt.batch_size, opt.max_doc_len, opt.gpus)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' %
                (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')
    sent_encoder = neusum.Models.Encoder(opt, dicts['src'])
    doc_encoder = neusum.Models.DocumentEncoder(opt)
    pointer = neusum.Models.Pointer(opt, dicts['tgt'])
    if opt.dec_init == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(opt.dec_init))

    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer, decIniter,
                                   rouge_calculator)
    summarizer = neusum.Summarizer(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    # Initialize parameters: 1-D tensors (biases) from a scaled normal distribution,
    # weight matrices with Xavier (Glorot) normal initialization.
    for pr_name, p in model.named_parameters():
        logger.info(pr_name)
        # p.data.uniform_(-opt.param_init, opt.param_init)
        if p.dim() == 1:
            # p.data.zero_()
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            xavier_normal(p, math.sqrt(3))
            # xavier_uniform(p)

    sent_encoder.load_pretrained_vectors(opt, logger)

    optim = neusum.Optim(opt.optim, opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)
    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(summarizer, opt.dev_input_src, opt.dev_ref)

    trainModel(model, summarizer, trainData, validData, dataset, optim)