def do_format_to_bert(args):
    """Run the BERT-format preprocessing step, printing a timestamp before and after.

    Args:
        args: Namespace of preprocessing options consumed by
            ``data_builder.format_to_bert`` (raw_path, save_path, dataset, ...).
    """
    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    print(time.perf_counter())
    data_builder.format_to_bert(args)
    print(time.perf_counter())
def summarize(input, num_sen=3):
    """Extractively summarize articles with a BertSum classifier checkpoint.

    Args:
        input: JSON string of the form ``{"articles": [...]}``.
            NOTE(review): the parameter shadows the ``input`` builtin; the
            name is kept for backward compatibility with existing callers.
        num_sen: number of sentences to select per article (default 3).

    Returns:
        dict: ``{"output": [...]}`` — the selected summary sentences,
        mapped back from the model's lower-cased output to their
        original-cased source sentences via ``match_sentence``.
    """
    print(input)
    payload = json.loads(input)
    articles = payload.get("articles")

    log_file = './files/logs/output.log'
    model_path = './files/models/cnndm_bertsum_classifier_best.pt'
    results_path = './files/results/'
    json_path = './files/json'

    # Split articles into sentences and dump the model-ready JSON shard.
    bert_data, all_sentences = sentence_splitter.get_articles_json(articles)
    with open(os.path.join(json_path, "test.1.json"), 'w+') as f:
        f.write(json.dumps(bert_data))

    # --- Stage 1: convert the JSON shard to BERT tensor format ---
    args = Namespace()
    args.dataset = 'test'
    args.raw_path = json_path
    args.save_path = './files/bert.pt/'
    args.log_file = log_file
    args.oracle_mode = 'greedy'
    args.map_path = './files/data/'
    args.shard_size = 2000
    args.min_nsents = 3
    args.max_nsents = 100
    args.min_src_ntokens = 5
    args.max_src_ntokens = 200
    args.lower = True
    args.n_cpus = 2
    data_builder.format_to_bert(args)

    # Rename the shard so the test dataloader picks it up.
    shutil.move("./files/bert.pt/test.1.bert.pt", "./files/bert.pt/.test.pt")

    # --- Stage 2: run the classifier to score/select sentences ---
    args = Namespace()
    args.encoder = 'classifier'
    args.mode = 'test'
    args.bert_data_path = './files/bert.pt/'
    args.model_path = './files/models/'
    args.result_path = results_path
    args.temp_dir = './temp'
    args.batch_size = 1000
    args.use_interval = True
    args.large = False
    args.hidden_size = 128
    args.ff_size = 512
    args.heads = 4
    args.inter_layers = 2
    args.rnn_size = 512
    args.param_init = 0
    args.param_init_glorot = True
    args.dropout = 0.1
    args.optim = 'adam'
    args.lr = 1
    args.beta1 = 0.9
    args.beta2 = 0.999
    args.decay_method = ''
    args.warmup_steps = 8000
    args.max_grad_norm = 0
    args.save_checkpoint_steps = 5
    args.accum_count = 1
    args.world_size = 1
    args.report_every = 1
    args.train_steps = 1000
    args.recall_eval = False
    args.visible_gpus = '-1'
    args.gpu_ranks = '0'
    args.log_file = log_file
    args.dataset = ''
    args.seed = 358
    args.test_all = False
    args.model_name = model_path
    args.train_from = ''
    args.report_rouge = True
    args.block_trigram = True
    args.num_sen = num_sen
    args.gpu_ranks = [int(i) for i in args.gpu_ranks.split(',')]

    os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus
    # init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    device_id = 0 if device == "cuda" else -1
    cp = args.model_name
    # step = int(cp.split('.')[-2].split('_')[-1])
    # NOTE(review): step is pinned so the result file name below is stable.
    step = 1000000
    test(args, device_id, cp, step)

    # --- Stage 3: read predictions and restore original casing ---
    # (At this stage all summaries are in lower case.)
    with open(os.path.join(results_path, "_step1000000.candidate"), 'r') as f:
        summaries = f.read()
    output = []
    for count, summary_lw in enumerate(summaries.splitlines()):
        sentences = all_sentences[count]
        for sentence_lw in summary_lw.split('<q>'):
            sentence = match_sentence(sentences, sentence_lw)
            output.append(sentence.strip())
    return {"output": output}
# NOTE(review): script fragment — `parser` is constructed before this chunk
# (not visible here). Registers the small_data/ corpus paths (the commented
# lines above it are the superseded data/ paths), plus sentence/token limits,
# then parses argv and runs tokenize + format_to_bert.
# NOTE(review): valid_src_path/valid_tgt_path reuse the *train* files —
# presumably intentional for this small-data experiment; confirm with author.
# NOTE(review): -min_src_ntokens defaults to 0 here but 5 elsewhere in this
# file — verify which is intended.
# parser.add_argument("-valid_src_path", default='data/train-small/short_text_t.txt') # parser.add_argument("-valid_tgt_path", default='data/train-small/summary_t.txt') # parser.add_argument("-test_src_path", default='data/test/short_text_t.txt') # parser.add_argument("-test_tgt_path", default='data/test/summary_t.txt') parser.add_argument("-train_src_path", default='small_data/train/short_text.txt') parser.add_argument("-train_tgt_path", default='small_data/train/summary.txt') parser.add_argument("-valid_src_path", default='small_data/train/short_text.txt') parser.add_argument("-valid_tgt_path", default='small_data/train/summary.txt') parser.add_argument("-test_src_path", default='small_data/test/short_text.txt') parser.add_argument("-test_tgt_path", default='small_data/test/summary.txt') parser.add_argument('-min_nsents', default=3, type=int) parser.add_argument('-max_nsents', default=100, type=int) parser.add_argument('-min_src_ntokens', default=0, type=int) parser.add_argument('-max_src_ntokens', default=200, type=int) parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True) parser.add_argument('-log_file', default='') parser.add_argument('-dataset', nargs='+', default=['train', 'valid', 'test'], help='train, valid or test, defaul will process all datasets') parser.add_argument('-n_cpus', default=2, type=int) args = parser.parse_args() init_logger(args.log_file) result = data_builder.tokenize(args) data_builder.format_to_bert(args, result)
# NOTE(review): fragment — the first tokens continue a
# parser.add_argument("-mode", ... call whose opening parenthesis lies
# outside this chunk. Builds the preprocessing arguments that convert
# my_json_data/ into bert_data_final/ and runs format_to_bert on the
# 'test' split only. The same argument list appears (triplicated) in the
# summarize(text) function later in this file.
type=str, help='format_to_lines or format_to_bert') parser.add_argument( "-oracle_mode", default='greedy', type=str, help= 'how to generate oracle summaries, greedy or combination, combination will generate more accurate oracles but take much longer time.' ) parser.add_argument("-map_path", default='../data/') parser.add_argument("-raw_path", default='../my_json_data/') parser.add_argument("-save_path", default='../bert_data_final/') parser.add_argument("-shard_size", default=2000, type=int) parser.add_argument('-min_nsents', default=3, type=int) parser.add_argument('-max_nsents', default=100, type=int) parser.add_argument('-min_src_ntokens', default=5, type=int) parser.add_argument('-max_src_ntokens', default=200, type=int) parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True) parser.add_argument('-log_file', default='../../logs/preprocess.log') parser.add_argument( '-dataset', default='test', help='train, valid or test, defaul will process all datasets') parser.add_argument('-n_cpus', default=2, type=int) args = parser.parse_args() data_builder.format_to_bert(args)
def _build_preprocess_args(raw_path, save_path, log_file, dataset=''):
    """Build the argparse Namespace shared by the three preprocessing stages.

    The original code built three byte-identical argument parsers that
    differed only in raw_path, save_path, log_file and dataset; those four
    are parameterized here. All other defaults are unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-mode", default='', type=str,
                        help='format_to_lines or format_to_bert')
    parser.add_argument(
        "-oracle_mode", default='greedy', type=str, help=
        'how to generate oracle summaries, greedy or combination, combination will generate more accurate oracles but take much longer time.'
    )
    parser.add_argument("-map_path", default='../data/')
    parser.add_argument("-raw_path", default=raw_path)
    parser.add_argument("-save_path", default=save_path)
    parser.add_argument("-shard_size", default=2000, type=int)
    parser.add_argument('-min_nsents', default=3, type=int)
    parser.add_argument('-max_nsents', default=100, type=int)
    parser.add_argument('-min_src_ntokens', default=5, type=int)
    parser.add_argument('-max_src_ntokens', default=200, type=int)
    parser.add_argument("-lower", type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument('-log_file', default=log_file)
    parser.add_argument('-dataset', default=dataset,
                        help='train, valid or test, defaul will process all datasets')
    parser.add_argument('-n_cpus', default=2, type=int)
    return parser.parse_args()


def summarize(text):
    """Summarize one raw story end-to-end with the pre-loaded BertSum model.

    Args:
        text: raw story text to summarize.

    Returns:
        The summary string produced by ``trainer.test``.

    NOTE(review): relies on module-level names (model_args, model, device,
    device_id, step, load_dataset, build_trainer) defined elsewhere in the
    file; also deletes its intermediate files on the way out.
    """
    # The tokenizer expects CNN/DM .story format: body, then "@highlight"
    # followed by a (dummy) reference summary.
    with io.open('../raw_stories/test.story', 'w', encoding="utf8") as file:
        file.write(text.strip() + "\n\n@highlight\n\n" + "tim")

    # TOKENIZE: raw_stories -> merged_stories_tokenized
    args = _build_preprocess_args('../raw_stories/',
                                  '../merged_stories_tokenized/',
                                  '../logs/cnndm.log')
    data_builder.tokenize(args)

    # FORMAT TO LINES: merged_stories_tokenized -> my_json_data
    args = _build_preprocess_args('../merged_stories_tokenized/',
                                  '../my_json_data/',
                                  '../logs/cnndm.log')
    data_builder.format_to_lines_only_test(args)

    # FORMAT TO BERT: my_json_data -> bert_data_final
    args = _build_preprocess_args('../my_json_data/',
                                  '../bert_data_final/',
                                  '../../logs/preprocess.log',
                                  dataset='test')
    data_builder.format_to_bert(args)

    # GENERATE SUMMARY
    test_iter = data_loader.Dataloader(model_args,
                                       load_dataset(model_args, 'test', shuffle=False),
                                       model_args.batch_size, device,
                                       shuffle=False, is_test=True)
    trainer = build_trainer(model_args, device_id, model, None)
    result_string = trainer.test(test_iter, step)

    # Remove the intermediate artifacts so the next call starts clean.
    os.remove("../raw_stories/test.story")
    os.remove("../merged_stories_tokenized/test.story.json")
    os.remove("../my_json_data/test.0.json")
    os.remove("../bert_data_final/test.0.bert.pt")
    return result_string
def do_format_to_bert(args):
    """Force ``format_to_bert`` mode and run the BERT preprocessing step.

    Args:
        args: Namespace of preprocessing options; ``args.mode`` is
            overwritten to ``"format_to_bert"`` before dispatching to
            ``data_builder.format_to_bert``.
    """
    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    print(time.perf_counter())
    args.mode = "format_to_bert"
    data_builder.format_to_bert(args)
    print(time.perf_counter())