import argparse
import sys
from typing import Optional

import torch

# NOTE: the import paths below are assumptions for the fairseq/pytorch_translate
# versions this script appears to target; adjust them to your installed version.
import fairseq
from fairseq import data, indexed_dataset, progress_bar, utils
from fairseq.sequence_generator import SequenceGenerator
from pytorch_translate import dictionary as pytorch_translate_dictionary

# Placeholder language codes (assumed); they are only used as keys when
# building the LanguageDatasets container in main() below.
SRC_LANG = 'src'
TGT_LANG = 'tgt'


def make_language_pair_dataset_from_text(
    source_text_file: str,
    target_text_file: str,
    source_dict: pytorch_translate_dictionary.Dictionary,
    target_dict: pytorch_translate_dictionary.Dictionary,
    append_eos: Optional[bool] = False,
    reverse_source: Optional[bool] = True,
) -> data.LanguagePairDataset:
    return data.LanguagePairDataset(
        src=indexed_dataset.IndexedRawTextDataset(
            path=source_text_file,
            dictionary=source_dict,
            append_eos=append_eos,
            reverse_order=reverse_source,
        ),
        dst=indexed_dataset.IndexedRawTextDataset(
            path=target_text_file,
            dictionary=target_dict,
            # We always append EOS to the target sentence since we still want
            # the model to output an indication that the sentence has finished,
            # even if we don't append the EOS symbol to the source sentence
            # (to prevent the model from misaligning UNKs or other words
            # to the frequently occurring EOS).
            append_eos=True,
            # We don't reverse the order of the target sentence, since even if
            # the source sentence is fed to the model backwards, we still want
            # the model to start outputting from the first word.
            reverse_order=False,
        ),
        pad_idx=source_dict.pad(),
        eos_idx=source_dict.eos(),
    )
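# A minimal usage sketch for the helper above. The dictionary file names and
# the Dictionary.load() classmethod are assumptions based on the fairseq-style
# dictionary API, not something this module defines:
#
#   source_dict = pytorch_translate_dictionary.Dictionary.load('dict.src.txt')
#   target_dict = pytorch_translate_dictionary.Dictionary.load('dict.tgt.txt')
#   dataset = make_language_pair_dataset_from_text(
#       source_text_file='corpus.src.txt',
#       target_text_file='corpus.tgt.txt',
#       source_dict=source_dict,
#       target_dict=target_dict,
#       append_eos=False,     # no EOS on the source side (see comment above)
#       reverse_source=True,  # feed the source to the encoder reversed
#   )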
def main():
    parser = argparse.ArgumentParser(description='Batch translate')
    # Needed by build_progress_bar() below, which reads args.no_progress_bar.
    parser.add_argument('--no-progress-bar', action='store_true',
                        help='disable progress bar')
    parser.add_argument('--model', metavar='FILE', required=True, action='append',
                        help='path(s) to model file(s)')
    parser.add_argument('--dictdir', metavar='DIR', required=True,
                        help='directory of dictionary files')
    parser.add_argument('--batch-size', default=32, type=int, metavar='N',
                        help='batch size')
    parser.add_argument('--beam', default=5, type=int, metavar='N',
                        help='beam size (default: 5)')
    # parser.add_argument('--nbest', default=1, type=int, metavar='N',
    #                     help='number of hypotheses to output')
    # parser.add_argument('--remove-bpe', nargs='?', const='@@ ', default=None,
    #                     help='remove BPE tokens before scoring')
    parser.add_argument('--no-early-stop', action='store_true',
                        help=('continue searching even after finalizing k=beam '
                              'hypotheses; this is more correct, but increases '
                              'generation time by 50%%'))
    # parser.add_argument('--unnormalized', action='store_true',
    #                     help='compare unnormalized hypothesis scores')
    parser.add_argument('--cpu', action='store_true', help='generate on CPU')
    parser.add_argument('--no-beamable-mm', action='store_true',
                        help="don't use BeamableMM in attention layers")
    parser.add_argument('--lenpen', default=1, type=float,
                        help='length penalty: <1.0 favors shorter, '
                             '>1.0 favors longer sentences')
    parser.add_argument('--unkpen', default=0, type=float,
                        help='unknown word penalty: <0 produces more unks, '
                             '>0 produces fewer')
    # parser.add_argument('--replace-unk', nargs='?', const=True, default=None,
    #                     help='perform unknown replacement '
    #                          '(optionally with alignment dictionary)')
    # parser.add_argument('--quiet', action='store_true',
    #                     help='Only print final scores')
    parser.add_argument('input', metavar='INPUT', help='Input file')
    args = parser.parse_args()

    # Required by the progress bar helper.
    args.log_format = None

    USE_CUDA = not args.cpu and torch.cuda.is_available()

    print('Loading model...', file=sys.stderr)
    models, _ = utils.load_ensemble_for_inference(args.model, data_dir=args.dictdir)
    src_dic = models[0].src_dict
    dst_dic = models[0].dst_dict

    for model in models:
        # Honor --no-beamable-mm (the flag was previously parsed but unused).
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam)

    translator = SequenceGenerator(
        models,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
    )
    if USE_CUDA:
        translator.cuda()

    # Only needed by the commented-out eval_dataloader() call below, which
    # would additionally filter out sentences the models cannot encode.
    max_positions = min(model.max_encoder_positions() for model in models)

    print('Loading input data...', file=sys.stderr)
    raw_dataset = indexed_dataset.IndexedRawTextDataset(args.input, src_dic)
    dataset = fairseq.data.LanguageDatasets(SRC_LANG, TGT_LANG, src_dic, dst_dic)
    # The raw source text doubles as a dummy target side; only the source
    # sentences are actually translated.
    dataset.splits['test'] = fairseq.data.LanguagePairDataset(
        raw_dataset, raw_dataset,
        pad_idx=dataset.src_dict.pad(),
        eos_idx=dataset.src_dict.eos())

    # itr = dataset.eval_dataloader(
    #     'test', max_sentences=args.batch_size, max_positions=max_positions)
    itr = dataset.eval_dataloader('test', max_sentences=args.batch_size)
    # In fairseq this helper lives in fairseq.progress_bar, not fairseq.utils.
    itr = progress_bar.build_progress_bar(args, itr)

    for sample_id, src_tokens, _, hypos in translator.generate_batched_itr(
            itr, cuda_device=0 if USE_CUDA else None):
        src_str = dataset.src_dict.string(src_tokens, '@@ ')
        # Keep only the best (first) hypothesis and strip BPE continuation
        # markers from the output.
        hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
            hypo_tokens=hypos[0]['tokens'].int().cpu(),
            src_str=src_str,
            alignment=hypos[0]['alignment'].int().cpu(),
            align_dict=None,
            dst_dict=dataset.dst_dict,
            remove_bpe='@@ ')
        print('{}\t{}'.format(sample_id, hypo_str), flush=True)


if __name__ == '__main__':
    main()
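# Example invocation (a sketch; the script name, checkpoint paths, and data
# files below are assumptions):
#
#   python batch_translate.py \
#       --model checkpoints/checkpoint1.pt --model checkpoints/checkpoint2.pt \
#       --dictdir data-bin \
#       --batch-size 64 --beam 5 \
#       test.src > translations.tsv
#
# --model may be repeated to decode with an ensemble (action='append'); the
# script writes one "<sample_id>\t<translation>" line per input sentence to
# stdout, so it can be redirected to a file as above.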