def main():
    opt = parser.parse_args()
    seq_length = opt.max_sent_length
    logger.info(opt)

    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = neusum.Summarizer(opt, logger=logger)

    outF = open(opt.output, 'w', encoding='utf-8')

    predScoreTotal, predWordsTotal, goldScoreTotal, goldWordsTotal = 0, 0, 0, 0
    srcBatch, tgtBatch = [], []
    src_raw, tgt_raw = [], []
    count = 0

    tgtF = open(opt.tgt) if opt.tgt else None

    for line in addone(open(opt.src, encoding='utf-8')):
        if line is not None:
            sline = line.strip()
            # Documents are stored one per line, with sentences joined by '##SENT##'.
            srcSents = sline.split('##SENT##')
            srcWords = [x.split(' ')[:seq_length] for x in srcSents]
            src_raw.append(srcSents)
            srcBatch.append(srcWords)
            if tgtF:
                tgtTokens = tgtF.readline().split(' ')
                tgtBatch += [tgtTokens]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # At the end of the file, flush the last (possibly partial) batch.
            if len(srcBatch) == 0:
                break

        predBatch, predId, predScore, goldScore = translator.translate(
            srcBatch, src_raw, None)
        predScoreTotal += sum(score[0] for score in predScore)
        predWordsTotal += sum(len(x[0]) for x in predBatch)

        for b in range(len(predBatch)):
            count += 1
            outF.write('{0}\t{1}'.format(predId[b], predBatch[b]) + '\n')
            outF.flush()

        srcBatch, tgtBatch = [], []
        src_raw, tgt_raw = [], []

    if tgtF:
        tgtF.close()
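
# A minimal sketch (an assumption, not the original definition) of the addone
# helper the reading loop above depends on: it yields each line of the file
# followed by a single trailing None, which triggers the `line is None` branch
# so the final partial batch still gets translated.
def addone(f):
    for line in f:
        yield line
    yield None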
def main():
    if not opt.online_process_data:
        raise Exception('This code does not use a preprocessed .pt pickle file; '
                        'pickling has issues with large files.')
    else:
        import onlinePreprocess
        # Configure the preprocessing module before importing its entry point.
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt,
                                      opt.tgt_vocab, opt.train_oracle,
                                      opt.train_src_rouge, opt.train_src_section,
                                      opt.drop_too_short, opt.drop_too_long)

    trainData = neusum.Dataset(dataset['train']['src'], dataset['train']['src_raw'],
                               dataset['train']['tgt'], dataset['train']['oracle'],
                               dataset['train']['src_rouge'],
                               dataset['train']['src_section'],
                               dataset['train']['src_section_raw'],
                               opt.batch_size, opt.max_doc_len, opt.gpus,
                               dataset['train']['bert_annotation'],
                               good_patterns=loglinear.Config.Keyword[opt.qtype],
                               use_good=True)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d' % dicts['src'].size())
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    model = loglinear.model.LogLinear(use_gpu=bool(opt.gpus))
    model.set_rules(opt.position_weight, opt.keyword_weight,
                    loglinear.Config.Keyword[opt.qtype], opt.in_bert_weight,
                    opt.in_section_weight,
                    loglinear.Config.PossibleSection[opt.qtype],
                    opt.section_embedding, opt.pre_word_vecs_enc)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    optim = neusum.Optim(opt.optim, opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)
    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        summarizer = neusum.Summarizer(opt, model, dataset)
        validData = load_dev_data(summarizer, opt.dev_input_src, opt.dev_ref,
                                  opt.dev_input_src_section, opt.drop_too_short,
                                  opt.drop_too_long,
                                  test_bert_annotation=opt.test_bert_annotation)

    trainModel(model, trainData, validData, dataset, optim)
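
# For orientation: a sketch of the shape of the dict that prepare_data_online
# appears to return, inferred purely from the accesses above (the exact
# contents live in onlinePreprocess and may differ).
#
# dataset = {
#     'train': {
#         'src': ..., 'src_raw': ..., 'tgt': ..., 'oracle': ...,
#         'src_rouge': ..., 'src_section': ..., 'src_section_raw': ...,
#         'bert_annotation': ...,
#     },
#     'dicts': {'src': ...},  # vocabulary object exposing .size()
# }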
def main():
    if not opt.online_process_data:
        raise Exception('This code does not use a preprocessed .pt pickle file; '
                        'pickling has issues with large files.')
    else:
        import onlinePreprocess
        # Configure the preprocessing module before importing its entry point.
        onlinePreprocess.seq_length = opt.max_sent_length
        onlinePreprocess.max_doc_len = opt.max_doc_len
        onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
        onlinePreprocess.norm_lambda = opt.norm_lambda
        from onlinePreprocess import prepare_data_online
        dataset = prepare_data_online(opt.train_src, opt.src_vocab, opt.train_tgt,
                                      opt.tgt_vocab, opt.train_oracle,
                                      opt.train_src_rouge)

    trainData = neusum.Dataset(dataset['train']['src'], dataset['train']['src_raw'],
                               dataset['train']['tgt'], dataset['train']['oracle'],
                               dataset['train']['src_rouge'],
                               opt.batch_size, opt.max_doc_len, opt.gpus)

    dicts = dataset['dicts']
    logger.info(' * vocabulary size. source = %d; target = %d' %
                (dicts['src'].size(), dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(dataset['train']['src']))
    logger.info(' * maximum batch size. %d' % opt.batch_size)

    logger.info('Building model...')
    sent_encoder = neusum.Models.Encoder(opt, dicts['src'])
    doc_encoder = neusum.Models.DocumentEncoder(opt)
    pointer = neusum.Models.Pointer(opt, dicts['tgt'])
    if opt.dec_init == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(opt.dec_init))

    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer,
                                   decIniter, rouge_calculator)
    summarizer = neusum.Summarizer(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    if opt.freeze_word_vecs_enc:
        logger.warning('Not updating encoder word embedding.')

    # Initialize parameters: scaled normal for 1-D tensors (biases),
    # Xavier-normal for weight matrices.
    for pr_name, p in model.named_parameters():
        logger.info(pr_name)
        if p.dim() == 1:
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            xavier_normal(p, math.sqrt(3))

    sent_encoder.load_pretrained_vectors(opt, logger)

    optim = neusum.Optim(opt.optim, opt.learning_rate,
                         max_grad_norm=opt.max_grad_norm,
                         max_weight_value=opt.max_weight_value,
                         lr_decay=opt.learning_rate_decay,
                         start_decay_at=opt.start_decay_at,
                         decay_bad_count=opt.halve_lr_bad_count)
    optim.set_parameters(model.parameters())

    validData = None
    if opt.dev_input_src and opt.dev_ref:
        validData = load_dev_data(summarizer, opt.dev_input_src, opt.dev_ref)

    trainModel(model, summarizer, trainData, validData, dataset, optim)
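
# Standalone illustration of the initialization scheme used above, applied to
# a toy module. Assumes a recent PyTorch, where xavier_normal_ replaces the
# deprecated xavier_normal called in the training code.
import math
import torch
from torch.nn.init import xavier_normal_

def init_like_above(module):
    for name, p in module.named_parameters():
        if p.dim() == 1:
            # 1-D tensors (biases): zero-mean normal, std shrinking with fan size
            p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
        else:
            # weight matrices: Xavier-normal with gain sqrt(3)
            xavier_normal_(p, math.sqrt(3))

init_like_above(torch.nn.Linear(4, 3))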
    if opt.dec_init == "simple":
        decIniter = neusum.Models.DecInit(opt)
    elif opt.dec_init == "att":
        decIniter = neusum.Models.DecInitAtt(opt)
    else:
        raise ValueError('Unknown decoder init method: {0}'.format(opt.dec_init))

    model = neusum.Models.NMTModel(sent_encoder, doc_encoder, pointer, decIniter)

    # Load the trained model weights from the checkpoint.
    logger.info('Loading trained model...')
    model.load_state_dict(checkpoint['model'])
    summarizer = neusum.Summarizer(opt, model, dataset)

    if len(opt.gpus) >= 1:
        model.cuda()
    else:
        model.cpu()

    testData = load_dev_data(summarizer, opt.dev_input_src, opt.dev_ref,
                             opt.dev_input_src_section,
                             test_bert_annotation=opt.test_bert_annotation)

    model.eval()
    scores = evalModel(model, summarizer, testData, opt.output_len, 'test',
                       opt.set_postfix, opt.stripping_mode, checkpoint['epoch'])
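
# Hypothetical sketch of the checkpoint load this fragment assumes has already
# happened; the flag name (opt.model) and dict layout are assumptions inferred
# from the checkpoint['model'] / checkpoint['epoch'] accesses above.
import torch

checkpoint = torch.load(opt.model, map_location=lambda storage, loc: storage)
# expected layout: {'model': <state_dict>, 'epoch': <int>, ...}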