示例#1
0
    def __init__(self, from_model_name, from_model_checkpt, to_model_name, to_model_checkpt,
        is_load_from_github=True, tokenzier_name='moses', bpe_name='fastbpe',
        device='cuda'):
        """Load the ``from`` and ``to`` fairseq models and switch them to eval mode.

        Args:
            from_model_name: Hub model name when ``is_load_from_github`` is
                true, otherwise the local directory of the first model.
            from_model_checkpt: Checkpoint file name of the first model.
            to_model_name: Hub model name or local directory of the second model.
            to_model_checkpt: Checkpoint file name of the second model.
            is_load_from_github: When true both models are fetched through
                ``torch.hub`` from ``pytorch/fairseq``; otherwise they are
                loaded with ``TransformerModel.from_pretrained``.
            tokenzier_name: Tokenizer name forwarded to fairseq.
            bpe_name: BPE name forwarded to fairseq.
            device: Forwarded to the parent constructor.

        Raises:
            ModuleNotFoundError: If fairseq cannot be imported.
            ValueError: If local loading fails with a ``TypeError``.
        """
        super().__init__(device, temperature=None, top_k=None, top_p=None)

        try:
            # ``fairseq`` itself is imported only to verify that it is installed.
            import fairseq  # noqa: F401
            from fairseq.models.transformer import TransformerModel
        except ModuleNotFoundError as err:
            raise ModuleNotFoundError('Missed fairseq library. Install fairseq by https://github.com/pytorch/fairseq') from err

        self.from_model_name = from_model_name
        self.from_model_checkpt = from_model_checkpt
        self.to_model_name = to_model_name
        self.to_model_checkpt = to_model_checkpt
        self.is_load_from_github = is_load_from_github
        self.tokenzier_name = tokenzier_name
        self.bpe_name = bpe_name

        def _load_from_hub(model_name, checkpt):
            # Fetch one model through torch.hub from the pytorch/fairseq repository.
            return torch.hub.load(
                github='pytorch/fairseq', model=model_name,
                checkpoint_file=checkpt,
                tokenizer=tokenzier_name, bpe=bpe_name)

        def _load_from_local(model_dir, checkpt):
            # Load one model from a local directory; a TypeError is reported
            # as a ValueError that lists the parameters used.
            try:
                return TransformerModel.from_pretrained(
                    model_name_or_path=os.path.join(model_dir, ''),
                    checkpoint_file=checkpt,
                    tokenizer=tokenzier_name, bpe=bpe_name)
            except TypeError as err:
                # ``format`` is used instead of ``+`` so that a non-string
                # argument (e.g. ``None``) cannot raise a second TypeError
                # while the message is being built.
                err_msg = (
                    'Cannot load model from local path. You may check the following parameters are correct or not.'
                    ' Model Directory: {}, Checkpoint File Name: {}, Tokenizer Name: {}, BPE Name: {}'
                ).format(model_dir, checkpt, tokenzier_name, bpe_name)
                raise ValueError(err_msg) from err

        load_model = _load_from_hub if is_load_from_github else _load_from_local
        self.from_model = load_model(from_model_name, from_model_checkpt)
        self.to_model = load_model(to_model_name, to_model_checkpt)

        self.from_model.eval()
        self.to_model.eval()
        # NOTE(review): ``self.device`` is presumably set by the parent
        # constructor from ``device`` -- confirm.
        if self.device == 'cuda':
            self.from_model.cuda()
            self.to_model.cuda()
示例#2
0
def translate(model_dir,
              in_file,
              out_file,
              batch_size,
              model_name,
              num_shards,
              shard_id,
              moses,
              spiece,
              lenpen,
              beam,
              ):
    """Translate one shard of ``in_file`` and write the results to ``out_file``.

    The model is loaded from ``model_dir`` with checkpoint ``model_name``.
    ``spiece`` selects sentencepiece (``spiece.model``) over subword-nmt
    (``bpecodes``); ``moses`` enables the moses tokenizer. The line range of
    the shard comes from ``get_line_ids(in_file, num_shards, shard_id)``.
    Sentences are translated in batches of ``batch_size`` with the given
    ``lenpen`` and ``beam`` and written one per line.
    """
    tokenizer = 'moses' if moses else None

    # Only the BPE-related keyword arguments differ between the two flavours.
    if spiece:
        bpe_kwargs = dict(
            bpe='sentencepiece',
            sentencepiece_model=os.path.join(model_dir, 'spiece.model'),
        )
    else:
        bpe_kwargs = dict(
            bpe='subword_nmt',
            bpe_codes=os.path.join(model_dir, 'bpecodes'),
        )
    model = TransformerModel.from_pretrained(model_dir,
                                             checkpoint_file=model_name,
                                             data_name_or_path=model_dir,
                                             tokenizer=tokenizer,
                                             **bpe_kwargs)

    start_id, end_id = get_line_ids(in_file, num_shards, shard_id)
    print(start_id, end_id)
    model.cuda()

    # Keep only the lines whose index falls inside [start_id, end_id).
    with open(in_file) as fin:
        src_sents = [
            raw.strip()
            for line_no, raw in enumerate(fin)
            if start_id <= line_no < end_id
        ]

    nb_batches = (len(src_sents) + batch_size - 1) // batch_size
    outputs = []
    for batch_no in range(nb_batches):
        print('Batch ID: {}/{}'.format(batch_no, nb_batches))
        first = batch_no * batch_size
        chunk = src_sents[first:first + batch_size]
        outputs += model.translate(chunk, lenpen=lenpen, beam=beam)

    with open(out_file, 'wt') as fout:
        for hypothesis in outputs:
            fout.write(hypothesis + '\n')
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Only model names containing ``"transformer.large"`` are handled; for
        any other name the method returns ``None``.

        Args:
            device (str): device information
        Returns:
            object: User-selected task-specific model
        """
        # NOTE(review): not used in this method; kept because dropping the
        # import could change import-time side effects -- confirm.
        from pororo.tasks import PororoPosFactory  # noqa: F401

        if "transformer.large" in self.config.n_model:
            from fairseq.models.transformer import TransformerModel

            def _load_pickle(name):
                # Resolve ``name`` through download_or_load and unpickle it,
                # closing the file handle afterwards.
                # NOTE(review): pickle can execute arbitrary code while
                # loading; these files must come from a trusted source.
                path = download_or_load(name, self.config.lang)
                with open(path, "rb") as handle:
                    return pickle.load(handle)

            load_dict = download_or_load(
                f"transformer/{self.config.n_model}",
                self.config.lang,
            )

            model = (TransformerModel.from_pretrained(
                model_name_or_path=load_dict.path,
                checkpoint_file=f"{self.config.n_model}.pt",
                data_name_or_path=load_dict.dict_path,
                source_lang=load_dict.src_dict,
                target_lang=load_dict.tgt_dict,
            ).eval().to(device))

            morph2idx = _load_pickle(f"misc/morph2idx.{self.config.lang}.pkl")
            tag2idx = _load_pickle(f"misc/tag2idx.{self.config.lang}.pkl")
            # The pickle holds four objects; the last one is not used here.
            query2origin, query2meaning, query2eng, _ = _load_pickle(
                f"misc/wsd-dicts.{self.config.lang}.pkl")

            return PororoTransformerWsd(
                model,
                morph2idx,
                tag2idx,
                query2origin,
                query2meaning,
                query2eng,
                self.config,
            )
    def initialize(self, context):
        """Load the fairseq model described by ``context`` and mark the handler ready.

        Reads ``model_dir`` and ``gpu_id`` from ``context.system_properties``,
        reads the optional ``setup_config.json`` from the model directory and
        loads ``model.pt`` with the moses tokenizer onto the selected device.

        Args:
            context: Object exposing ``manifest`` and ``system_properties``.
        """
        self._context = context
        # Only flagged as initialized once the model has really been loaded.
        self.initialized = False
        self.manifest = context.manifest

        properties = context.system_properties
        model_dir = properties.get("model_dir")

        if torch.cuda.is_available():
            self.device = torch.device("cuda:" + str(properties.get("gpu_id")))
        else:
            self.device = torch.device("cpu")

        #read configs for the model_name, bpe etc. from setup_config.json
        setup_config_path = os.path.join(model_dir, "setup_config.json")
        # Always define the attribute so the warning path below no longer
        # ends in an AttributeError.
        self.setup_config = {}
        if os.path.isfile(setup_config_path):
            with open(setup_config_path) as setup_config_file:
                self.setup_config = json.load(setup_config_file)
        else:
            logger.warning('Missing the setup_config.json file.')

        #  load the model
        # NOTE(review): without a configured "bpe" entry ``bpe=None`` is
        # passed -- confirm that is acceptable for the deployed model.
        self.model = TransformerModel.from_pretrained(
            model_dir,
            checkpoint_file='model.pt',
            data_name_or_path=model_dir,
            tokenizer='moses',
            bpe=self.setup_config.get("bpe"))
        self.model.to(self.device)
        self.model.eval()
        self.initialized = True
示例#5
0
文件: hubconf.py 项目: ictnlp/PTE-NMT
def transformer(*args, **kwargs):
    """
    Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)
    <https://arxiv.org/abs/1706.03762>`_.

    An interactive-generation argument parser is created and handed to
    ``TransformerModel.from_pretrained`` together with ``args``/``kwargs``.
    """
    generation_parser = options.get_interactive_generation_parser()
    return TransformerModel.from_pretrained(generation_parser, *args, **kwargs)
示例#6
0
    def __init__(self):
        """Load the ar->zh and zh->ar models and create the Farasa segmenter."""
        self.langs = ["zh -> ar", "ar -> zh", "ar -> fr", "fr -> ar"]

        # Both directions use the same data directory and BPE codes; only the
        # checkpoint directory differs.
        common_kwargs = dict(
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path='data-bin',
            bpe='subword_nmt',
            bpe_codes='data-bin/code',
        )
        self.model_ar2zh = TransformerModel.from_pretrained(
            "checkpoints-ar2zh", **common_kwargs)
        self.model_zh2ar = TransformerModel.from_pretrained(
            "checkpoints-zh2ar", **common_kwargs)

        self.segmenter = FarasaSegmenter(interactive=True)
        self.models = {
            "ar2zh": self.model_ar2zh,
            "zh2ar": self.model_zh2ar,
        }
示例#7
0
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Only model names containing ``"multi"`` are handled; any other name
        yields ``None``.

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        from pororo.tasks import PororoTokenizationFactory

        def sent_tokenizer(text, lang):
            # Builds and loads a sentence tokenizer for ``lang`` on each call.
            factory = PororoTokenizationFactory(
                task="tokenization",
                lang=lang,
                model=f"sent_{lang}",
            )
            return factory.load(device).predict(text)

        n_model = self.config.n_model
        if "multi" not in n_model:
            return None

        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{n_model}",
            self.config.lang,
        )

        pretrained = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        model = pretrained.eval().to(device)

        tokenizer = CustomTokenizer.from_file(
            vocab_filename=f"{load_dict.src_tok}/vocab.json",
            merges_filename=f"{load_dict.src_tok}/merges.txt",
        )

        # The first marker found in the model name decides the style.
        langtok_style = "basic"
        for marker, style in (("mtpg", "mbart"), ("m2m", "multilingual")):
            if marker in n_model:
                langtok_style = style
                break

        return PororoTransformerTransMulti(
            model,
            self.config,
            tokenizer,
            sent_tokenizer,
            langtok_style,
        )
示例#8
0
def load_model(seed):
    """
    Load the model trained with the given seed (an integer).

    The checkpoint directory is ``work/checkpoints_seed<seed>``; the model is
    moved to the module-level ``device`` and switched to eval mode.
    """
    checkpoint_dir = f'work/checkpoints_seed{seed!s}'
    pretrained = TransformerModel.from_pretrained(
        checkpoint_dir,
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path='work/processed_data/fairseq_preprocessed_data',
    )
    model = pretrained.to(device)
    model.eval()
    return model
示例#9
0
def english_to_french(text):
    """Translate ``text`` with the bundled wmt14 en-fr model.

    The current working directory is printed and the model is loaded on
    every call before translating.
    """
    print("path ---->", os.getcwd())

    model_dir = 'translator_app/core/pretrained_models/wmt14.en-fr.fconv-py'
    translator = TransformerModel.from_pretrained(
        model_dir,
        checkpoint_file='model.pt',
        data_name_or_path=model_dir,
        bpe='fastbpe',
        bpe_codes=model_dir + '/bpecodes',
    )
    return translator.translate(text)
示例#10
0
def load_smi_to_iupac_model():
    """Load the smi->iupac model below ``root_dir`` and return it in eval mode."""
    working_dir = str(Path().absolute())
    checkpoint_path = f'{root_dir}/checkpoints/checkpoint_best.pt'
    data_path = f'{root_dir}/data-bin/smi_iupac.smi-iupac/'
    codes_path = f'{root_dir}/preprocess/smi_iupac/code'

    model = TransformerModel.from_pretrained(
        working_dir,
        checkpoint_file=checkpoint_path,
        data_name_or_path=data_path,
        bpe='subword_nmt',
        bpe_codes=codes_path,
    )
    model.eval()

    print('Load the model OK!')
    return model
    def load(self, device: str):
        """
        Load user-selected task-specific model

        ``"charbert"`` model names are checked first, then ``"transformer"``
        ones; any other name yields ``None``.

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        n_model = self.config.n_model

        if "charbert" in n_model:
            from pororo.models.brainbert import CharBrainRobertaModel

            loaded = CharBrainRobertaModel.load_model(
                f"bert/{n_model}",
                self.config.lang,
            )
            model = loaded.eval().to(device)
            print(
                "As of now, this beta model tries to correct spacing errors in Korean text."
            )
            return PororoBertSpacing(model, self.config)

        if "transformer" not in n_model:
            return None

        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        load_dict = download_or_load(
            f"transformer/{n_model}",
            self.config.lang,
        )

        pretrained = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        model = pretrained.eval().to(device)

        # Character-level models are returned without a tokenizer.
        if "char" in n_model:
            return PororoTransformerGecChar(model, self.config)

        # A tokenizer is only built when a source tokenizer path is given.
        if load_dict.src_tok:
            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{load_dict.src_tok}/vocab.json",
                merges_filename=f"{load_dict.src_tok}/merges.txt",
            )
        else:
            tokenizer = None

        return PororoTransformerGec(model, tokenizer, device, self.config)
示例#12
0
 def __init__(self,
              model_dir,
              model_file,
              tokenizer='moses',
              bpe='subword_nmt',
              use_cuda=True):
     """Load a fairseq model from ``model_dir``/``model_file``.

     The model is moved to the GPU only when ``use_cuda`` is set and CUDA
     is available.
     """
     pretrained = TransformerModel.from_pretrained(
         model_dir, model_file, tokenizer=tokenizer, bpe=bpe)
     self.model = pretrained

     if not use_cuda or not torch.cuda.is_available():
         return
     self.model.cuda()
示例#13
0
    def __init__(self):
        """Load one fairseq model per entry of ``possible_dialects``.

        The base directory is read from ``G2P_MODEL_DIR`` and defaults to
        ``/data/models/g2p/fairseq/``.
        """
        self.possible_dialects = ['standard', 'north', 'north_east', 'south']
        self.dialect_models = {}

        model_dir = os.getenv("G2P_MODEL_DIR", "/data/models/g2p/fairseq/")
        # Select the data and checkpoint paths based on the dialect.
        for dialect in self.possible_dialects:
            data_dir = f'{model_dir}/data-bin/{dialect}'
            checkpoint_file = (
                f'{model_dir}/checkpoints/{dialect}'
                '-256-.3-s-s/checkpoint_last.pt'
            )
            loaded = TransformerModel.from_pretrained(data_dir, checkpoint_file)
            self.dialect_models[dialect] = loaded
示例#14
0
def backtranslation_using_en_de_model(args):
    """Back-translate the training examples (en -> de -> en) into ``bt_aug.tsv``.

    Args:
        args: Namespace providing ``task_name``, ``output_dir``, ``seed``,
            ``data_dir`` and ``cache`` (directory holding the two wmt19
            single-model folders).

    Each output row holds the example label and the back-translated text,
    tab separated, in ``<output_dir>/bt_aug.tsv``.
    """
    task_name = args.task_name
    os.makedirs(args.output_dir, exist_ok=True)

    # Seed every random number generator used.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True

    processor = get_task_processor(task_name, args.data_dir)
    # load train and dev data
    train_examples = processor.get_train_examples()

    # load the best model
    en_de_model = TransformerModel.from_pretrained(
        os.path.join(args.cache, "wmt19.en-de.joined-dict.single_model"),
        checkpoint_file="model.pt",
        tokenizer='moses',
        bpe='fastbpe')

    de_en_model = TransformerModel.from_pretrained(
        os.path.join(args.cache, "wmt19.de-en.joined-dict.single_model"),
        checkpoint_file="model.pt",
        tokenizer='moses',
        bpe='fastbpe')

    # NOTE(review): the original code had the ``.to(device)`` calls for both
    # models commented out; they are still not moved to a device here.

    save_train_path = os.path.join(args.output_dir, "bt_aug.tsv")
    # ``with`` guarantees the file is flushed and closed; ``newline=''`` is
    # what the csv module asks for on files handed to ``csv.writer``.
    with open(save_train_path, 'w', newline='') as save_train_file:
        tsv_writer = csv.writer(save_train_file, delimiter='\t')
        for example in train_examples:
            text = example.text_a
            de_example = en_de_model.translate(text, remove_bpe=True)
            back_translated_example = de_en_model.translate(de_example,
                                                            remove_bpe=True)
            tsv_writer.writerow([example.label, back_translated_example])
示例#15
0
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Only model names containing ``"transformer"`` are handled. The
        wrapper and its POS tagger depend on ``self.config.lang`` (``ko``,
        ``en`` or ``zh``); any other combination yields ``None``.

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "transformer" not in self.config.n_model:
            return None

        from fairseq.models.transformer import TransformerModel

        from pororo.tasks import PororoPosFactory

        load_dict = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )

        pretrained = TransformerModel.from_pretrained(
            model_name_or_path=load_dict.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=load_dict.dict_path,
            source_lang=load_dict.src_dict,
            target_lang=load_dict.tgt_dict,
        )
        model = pretrained.eval().to(device)

        # POS tagger model used for each supported language.
        tagger_models = {"ko": "mecab-ko", "en": "nltk", "zh": "jieba"}
        lang = self.config.lang
        if lang not in tagger_models:
            return None

        tagger = PororoPosFactory(
            task="pos",
            model=tagger_models[lang],
            lang=lang,
        ).load(device)

        if lang == "ko":
            return PororoTransConstKo(model, tagger, self.config)
        if lang == "en":
            return PororoTransConstEn(model, tagger, self.config)
        return PororoTransConstZh(model, tagger, self.config)
示例#16
0
文件: pan.py 项目: shshnk94/xencoder
def build_model():
    """Return ``(src_encoder, tgt_encoder)``.

    ``src_encoder`` is the ``xlm-roberta-large`` model; ``tgt_encoder`` is
    the sub-module named ``models.0.encoder`` of the wmt14 en-fr model.
    """
    src_encoder = XLMRobertaModel.from_pretrained('xlm-roberta-large')

    model_dir = '/home/mindreese/xencoder/wmt14.en-fr.joined-dict.transformer/'
    en2fr = TransformerModel.from_pretrained(
        model_dir,
        checkpoint_file='model.pt',
        bpe='subword_nmt',
        bpe_codes=model_dir + 'bpecodes',
    )

    # Collect every module with the wanted name and keep the first one.
    encoders = []
    for module_name, module in en2fr.named_modules():
        if module_name == 'models.0.encoder':
            encoders.append(module)
    tgt_encoder = encoders[0]

    return src_encoder, tgt_encoder
示例#17
0
 def __init__(self, lang: str):
     """Load the ``mbart50.ft.nn`` model with ``en_XX`` as source and ``lang`` as target.

     The model is put in eval mode and moved to CUDA when available,
     otherwise to the CPU.
     """
     model_dir = "mbart50.ft.nn"
     self.bart = TransformerModel.from_pretrained(
         model_dir,
         checkpoint_file="model.pt",
         data_name_or_path=model_dir,
         bpe="sentencepiece",
         sentencepiece_model=f"{model_dir}/sentence.bpe.model",
         lang_dict=f"{model_dir}/ML50_langs.txt",
         target_lang=lang,
         source_lang="en_XX",
         encoder_langtok="src",
     )
     self.bart.eval()

     if torch.cuda.is_available():
         target_device = torch.device("cuda")
     else:
         target_device = torch.device("cpu")
     self.bart.to(target_device)

     self.lang = lang
示例#18
0
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Handles the model names ``p2g.zh`` and ``p2g.ja``; any other name
        yields ``None``.

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        n_model = self.config.n_model

        if n_model == "p2g.zh":
            from pororo.models.p2g import P2gM

            # Resolved in order: pinyin index, character index, checkpoint.
            resources = (
                f"misc/pinyin2idx.{self.config.lang}.pkl",
                f"misc/char2idx.{self.config.lang}.pkl",
                f"misc/{n_model}.pt",
            )
            pinyin, char, ckpt = [
                download_or_load(resource, self.config.lang)
                for resource in resources
            ]
            return PororoP2GZh(P2gM(pinyin, char, ckpt, device), self.config)

        if n_model == "p2g.ja":
            from fairseq.models.transformer import TransformerModel

            load_dict = download_or_load(
                "transformer/transformer.base.ja.p2g",
                self.config.lang,
            )

            pretrained = TransformerModel.from_pretrained(
                model_name_or_path=load_dict.path,
                checkpoint_file="transformer.base.ja.p2g.pt",
                data_name_or_path=load_dict.dict_path,
                source_lang=load_dict.src_dict,
                target_lang=load_dict.tgt_dict,
            )
            model = pretrained.eval().to(device)

            return PororoP2GJa(model, self.config)

        return None
示例#19
0
    def __init__(self):
        """Load the en->is model on the CPU and put it in eval mode."""
        model_dir = "/data/models/eng-isl-base-v1"
        # Common prefix of the vocabulary and merges files.
        bbpe_prefix = (
            "/data/models/fairseq-eng-isl-base-std-parice/"
            "eng-isl-bbpe/eng-isl-bbpe-32k/eng-isl-bbpe-32k"
        )

        self.model = TransformerModel.from_pretrained(
            model_dir,
            checkpoint_file="checkpoint.en-is.avg8.pt",
            data_name_or_path=model_dir,
            gpt2_encoder_json=f"{bbpe_prefix}-vocab.json",
            gpt2_vocab_bpe=f"{bbpe_prefix}-merges.txt",
            source_lang="en",
            target_lang="is",
            bpe="gpt2",
            beam=5,
            len_penalty=0.6,
            task="translation_with_backtranslation",
        )

        self.model.to("cpu")
        self.model.eval()
 def __init__(
     self,
     src_vocab_path,
     tgt_vocab_path,
     fairseq_path="/home/acb11204eq/data/wmt14_ende_fair/wmt14_ende_fairseq"
 ):
     """Build the source/target vocabulary maps and load the fairseq model.

     ``self.transformer`` is the first entry of the loaded hub model's
     ``models`` sub-module, with training mode switched off.
     """
     src_dict_path = f"{fairseq_path}/dict.src.txt"
     self.src_vmap = self.build_vocab_map(src_vocab_path, src_dict_path)
     tgt_dict_path = f"{fairseq_path}/dict.tgt.txt"
     self.tgt_vmap = self.build_vocab_map(tgt_vocab_path, tgt_dict_path)

     hub_model = TransformerModel.from_pretrained(
         fairseq_path,
         checkpoint_file=f"{fairseq_path}/checkpoint.pt",
         data_name_or_path=fairseq_path,
     )
     if torch.cuda.is_available():
         hub_model.cuda()

     self.transformer = hub_model._modules["models"][0]
     self.transformer.train(False)
示例#21
0
    def __init__(self):
        """Create the English Moses tokenizer and load the EN->TH model."""
        self._tokenizer = MosesTokenizer("en")

        self._model_name = _EN_TH_MODEL_NAME

        _download_install(self._model_name)

        models_path = _get_translate_path(
            self._model_name, _EN_TH_FILE_NAME, "models")
        vocab_path = _get_translate_path(
            self._model_name, _EN_TH_FILE_NAME, "vocab")
        self._model = TransformerModel.from_pretrained(
            model_name_or_path=models_path,
            checkpoint_file="checkpoint.pt",
            data_name_or_path=vocab_path,
        )
def load_translate(dataset, testset):
    """Translate the ``testset`` test split with the model trained on ``dataset``.

    Args:
        dataset: Name of the checkpoint sub-directory under ``./checkpoint``.
        testset: Name of the test set; selects the ``data-bin`` directory,
            the BPE-encoded source file and the reference file.

    Returns:
        tuple: ``(hyp_lines, ref_lines)`` -- the model output and the
        reference lines read from ``datasets/<testset>/test.ta``.

    The hypotheses are also written, one per line, to
    ``generation_results/<dataset>on<testset>.txt``.
    """
    model = TransformerModel.from_pretrained(
        f'./checkpoint/{dataset}',
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=f'data-bin/{testset}',
        bpe='sentencepiece',
        sentencepiece_model='./bpe_model/ta.wiki.bpe.vs50000.model')

    model.eval()

    # The original called ``en2de.cuda()``, a name that is not defined in
    # this function; the loaded model is the object to move.
    model.cuda()

    # Explicit UTF-8 keeps reading and writing independent of the locale.
    with open(f'intermediate_datasets/BPE/{testset}/test.en',
              encoding='utf-8') as f:
        src_sentences = f.read().splitlines()

    with open(f'datasets/{testset}/test.ta', encoding='utf-8') as f:
        ref_lines = f.read().splitlines()
    hyp_lines = model.translate(tqdm(src_sentences))

    with open(f'generation_results/{dataset}on{testset}.txt', 'w',
              encoding='utf-8') as f:
        f.writelines(f'{sentence}\n' for sentence in hyp_lines)

    return hyp_lines, ref_lines
示例#23
0
    def __init__(self):
        """Download (if needed) and load the TH->EN model with its sentencepiece model."""
        self._model_name = _TH_EN_MODEL_NAME

        _download_install(self._model_name)

        models_path = _get_translate_path(
            self._model_name, _TH_EN_FILE_NAME, "models")
        vocab_path = _get_translate_path(
            self._model_name, _TH_EN_FILE_NAME, "vocab")
        spm_path = _get_translate_path(
            self._model_name, _TH_EN_FILE_NAME, "bpe", "spm.th.model")

        self._model = TransformerModel.from_pretrained(
            model_name_or_path=models_path,
            checkpoint_file="checkpoint.pt",
            data_name_or_path=vocab_path,
            bpe="sentencepiece",
            sentencepiece_model=spm_path,
        )
示例#24
0
def translate(input_file,
              output_file,
              device,
              folder,
              beam_size=3,
              batch_size=256,
              replace_unk=False):
    """Translate ``input_file`` line by line into ``output_file``.

    Args:
        input_file: Path of the text file to translate.
        output_file: Path the translations are written to, one per line.
        device: Device the model is moved to.
        folder: Model directory; ``checkpoint_best.pt`` is loaded from it.
        beam_size: Beam size handed to the model.
        batch_size: Number of input lines translated per call.
        replace_unk: When true, ``<unk>`` markers are removed from each
            translation and double spaces are replaced by single ones.

    Source and translation are also printed for every line.
    """
    translator = TransformerModel.from_pretrained(
        folder, checkpoint_file='checkpoint_best.pt', beam=beam_size)
    translator.to(device)
    translator.eval()

    # ``with`` closes both files, so the output is flushed even if the
    # translation fails part-way through.
    with open(input_file, "r") as input_f, open(output_file, "w") as output_f:
        for batch in tqdm(chunked(input_f, batch_size)):
            for src, sentence in zip(batch, translator.translate(batch)):
                if replace_unk:
                    sentence = sentence.replace("<unk>", "")
                    sentence = sentence.replace("▁< unk >", "")
                    sentence = sentence.replace("  ", " ")
                print("Source text: {}".format(src.strip()))
                print("Translation text: {}".format(sentence))
                print(sentence, file=output_f)
示例#25
0
def main():
    """
    Translate every cell of a tsv file from English to German.

    Give the path of the source file (in tsv format) on the command line via
    ``--source-file=...``. The target file, where the German output is saved,
    is hard coded in ``TARGET_FILE``.

    Raises:
        Exception: If the source file does not exist or does not end in
            ``.tsv``.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--source-file', required=True)

    args = parser.parse_args()

    source_file = args.source_file

    #check if source file is a valid file
    # The original referenced the undefined name ``source_file_tsv`` here,
    # which turned every validation into a NameError.
    if not os.path.isfile(source_file) or not source_file.endswith('.tsv'):
        raise Exception(f"{source_file} is no valide file")

    en2de = TransformerModel.from_pretrained(
        f'{DATA_DIR}',
        checkpoint_file=f'{DATA_DIR}/model4.pt',
        data_name_or_path=f'{DATA_DIR}',
        bpe='fastbpe',
        bpe_codes=f'{DATA_DIR}/bpecodes',
        tokenizer='moses')

    lines_en = convert_tsv_lines_utf8_en_de(source_file)

    # ``newline=''`` is what the csv module asks for on files handed to
    # ``csv.writer``.
    with open(TARGET_FILE, 'w', newline='') as target_tsv:
        target_tsv_writer = csv.writer(target_tsv, delimiter='\t')
        for line in lines_en:
            # Translate each cell on its own, keeping the column layout.
            new_line_de = [en2de.translate(text_en) for text_en in line]
            target_tsv_writer.writerow(new_line_de)

    print('SUCCESS!')
示例#26
0
"""
Use AIResearch MT model easily
"""

import os

# Install the runtime dependencies before fairseq is imported below; the
# fairseq revision is pinned to a specific commit. Statement order matters.
os.system('pip install sentencepiece')
os.system('pip install git+https://github.com/pytorch/fairseq@6f6461b')

from fairseq.models.transformer import TransformerModel

# download model
# The archive is streamed from the release URL and unpacked into the
# current working directory.
url = 'https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz'
os.system(f'curl -L {url} | tar xz')

# Load the model from the unpacked archive: checkpoint from ``models/``,
# dictionaries from ``vocab/`` and the sentencepiece model from ``bpe/``.
# NOTE(review): the keyword ``sentencepiece_vocab`` presumably matches the
# pinned fairseq revision -- confirm before changing that revision.
model = TransformerModel.from_pretrained(
    model_name_or_path=
    'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/models/',
    checkpoint_file='checkpoint.pt',
    data_name_or_path=
    'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/',
    bpe='sentencepiece',
    sentencepiece_vocab=
    'SCB_1M-MT_OPUS+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model')

# function en2th.translate
# Module-level alias so callers can use ``translate(...)`` directly.
translate = model.translate
示例#27
0
def main():
    """Print paraphrases of input sentences via an en->fr->en round trip.

    Command-line arguments select the en2fr model, the fr2en
    mixture-of-experts model, an optional fairseq ``user_dir`` and the number
    of experts. Every non-empty input line (from the given files, or stdin
    for "-") is translated with the en2fr model, then translated back once
    per expert; each result is printed on its own line.

    Raises:
        RuntimeError: if ``--user-dir`` is omitted and the default
            ``translation_moe/src`` directory does not exist.
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument('--en2fr', required=True, help='path to en2fr model')
    cli.add_argument(
        '--fr2en',
        required=True,
        help='path to fr2en mixture of experts model',
    )
    cli.add_argument(
        '--user-dir',
        help='path to fairseq examples/translation_moe/src directory',
    )
    cli.add_argument(
        '--num-experts',
        type=int,
        default=10,
        help='(keep at 10 unless using a different model)',
    )
    cli.add_argument(
        'files',
        nargs='*',
        default=['-'],
        help='input files to paraphrase; "-" for stdin',
    )
    args = cli.parse_args()

    if args.user_dir is None:
        # Default: <parent of this file's directory>/translation_moe/src
        script_dir = os.path.dirname(os.path.abspath(__file__))
        examples_dir = os.path.dirname(script_dir)  # examples/
        args.user_dir = os.path.join(examples_dir, 'translation_moe', 'src')
        if not os.path.exists(args.user_dir):
            raise RuntimeError(
                'cannot find fairseq examples/translation_moe/src '
                '(tried looking here: {})'.format(args.user_dir))
        logging.info('found user_dir:' + args.user_dir)

    logging.info('loading en2fr model from:' + args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr,
        tokenizer='moses',
        bpe='sentencepiece',
    )
    en2fr = en2fr.eval()

    logging.info('loading fr2en model from:' + args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        tokenizer='moses',
        bpe='sentencepiece',
        user_dir=args.user_dir,
        task='translation_moe',
    )
    fr2en = fr2en.eval()

    logging.info('Type the input sentence and press return:')
    for raw_line in fileinput.input(args.files):
        sentence = raw_line.strip()
        if not sentence:
            continue
        french = en2fr.translate(sentence)
        # Collect every expert's output before printing anything.
        paraphrases = [
            fr2en.translate(french, inference_step_args={'expert': expert})
            for expert in range(args.num_experts)
        ]
        for paraphrase in paraphrases:
            print(paraphrase)
示例#28
0
    def run(self):
        """Score selected sentence pairs with every context-aware model.

        For each document returned by ``self.load()``, the sentences whose
        pair score is at least ``self.score_threhold`` are translated by every
        model listed in ``self.context_aware_translation_models`` (keyed by
        context bias; a bias of ``-1`` uses no context sentence). For each
        sentence and bias, the negated BLEU of the translation and the
        criterion's ``loss`` / ``nll_loss`` values are recorded. The nested
        result ``{doc_id: {sent_id: {metric: {bias: value}}}}`` is handed to
        ``self.dump``.

        A model with bias ``-1`` must be configured: its ``args`` and ``task``
        are used to build the criterion and the batch iterator.
        """
        def tokenize_for_bleu(target):
            """Turn a space-separated piece string into BLEU-ready text.

            The pieces are decoded with the SentencePiece tokenizer. When the
            target language is Japanese, the decoded text is additionally
            segmented with MeCab and the first tab-separated field of each
            output line is joined with spaces.
            """
            target = tokenizer.decode_pieces(target.split())
            if self.target_lang == "ja":
                # The last two lines of the MeCab output are dropped
                # (presumably the "EOS" marker and a trailing empty line).
                target = " ".join(
                    map(
                        lambda x: x.split("\t")[0],
                        tagger.parse(target).split("\n")[:-2],
                    ))
            return target

        docs = self.load()
        tagger = MeCab.Tagger()
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(self.context_aware_sentencepiece_model)
        # Load one model per configured context bias, in half precision on
        # the GPU and in eval mode.
        translation_models = {}
        for bias, path in self.context_aware_translation_models.items():
            # path = <model directory>/<checkpoint file name>
            base_path, checkpoint_path = os.path.split(path)
            model = (TransformerModel.from_pretrained(
                base_path,
                checkpoint_file=checkpoint_path).half().cuda().eval())
            model.args.max_source_positions = self.max_source_positions
            model.args.max_target_positions = self.max_target_positions
            translation_models[int(bias)] = model
        # The bias -1 model supplies the args/task shared by all models below.
        args = translation_models[-1].args
        task = translation_models[-1].task
        criterion = task.build_criterion(args)
        results = collections.defaultdict(dict)
        for doc_id, doc in tqdm.tqdm(docs.items(), total=len(docs)):
            # Ids of the sentences whose pair score reaches the threshold.
            parallel_doc = set([
                sent_id for sent_id, score in doc["pairs"]
                if score >= self.score_threhold
            ])
            # batches[context_bias][sent_id] -> source string, or None if the
            # sentence is skipped for that bias.
            batches = collections.defaultdict(dict)
            targets = {}
            for sent_id in parallel_doc:
                source, target = [
                    tokenizer.encode_as_pieces(doc[lang][sent_id])
                    for lang in (self.source_lang, self.target_lang)
                ]
                # Indices of the preceding sentences with non-empty source
                # text; these are the candidates for the context sentence.
                available_index = [
                    index for index in range(0, sent_id)
                    if doc[self.source_lang][index]
                ]
                # context_bias is the parameter which the model is trained with.
                # context_sent_index is the index of the actual used contextual
                # sentence.
                targets[sent_id] = " ".join(target)
                for context_bias, _ in translation_models.items():
                    context_sent_index = None
                    if context_bias != -1:
                        if len(available_index) < context_bias:
                            # Not enough preceding sentences.
                            # NOTE(review): index -1 selects the LAST source
                            # entry of the document -- confirm that this is
                            # the intended placeholder context.
                            context_sent_index = -1
                        else:
                            # The context_bias-th preceding non-empty sentence.
                            context_sent_index = available_index[-context_bias]
                        source_context = tokenizer.encode_as_pieces(
                            docs[doc_id][self.source_lang][context_sent_index])
                        real_source = source_context + [CONCAT_TOKEN] + source
                    else:
                        # Bias -1: translate the sentence without any context.
                        real_source = source
                    # Empty sources and sources that reach the position limit
                    # are stored as None so they are skipped later on.
                    if real_source and len(
                            real_source) < self.max_source_positions:
                        source_sentence = " ".join(real_source)
                    else:
                        source_sentence = None
                    batches[context_bias][sent_id] = source_sentence
            # batch_results[sent_id][metric][context_bias] -> value
            batch_results = collections.defaultdict(
                lambda: collections.defaultdict(dict))
            for context_bias, batch in batches.items():
                data = [sentence for sentence in batch.values() if sentence]
                if not data:
                    continue
                # Same filter and same iteration order as `data`, so the two
                # can be walked in parallel below.
                real_targets = {
                    sent_id: targets[sent_id]
                    for sent_id in batch if batch[sent_id]
                }
                model = translation_models[context_bias]
                args.max_source_positions = self.max_source_positions
                args.max_target_positions = self.max_target_positions
                translated = model.translate(data)
                # Compute the BLEU score.
                # The score is stored negated (presumably so that, like the
                # losses, a lower value is better -- confirm against the
                # consumer of the dumped results).
                for trans, (sent_id, target) in zip(translated,
                                                    real_targets.items()):
                    batch_results[sent_id]["bleu"][
                        context_bias] = -sacrebleu.corpus_bleu(
                            tokenize_for_bleu(trans),
                            tokenize_for_bleu(target)).score
                # Compute loss
                # Encode the already-tokenized strings with the model's
                # dictionaries, splitting on whitespace only.
                src_tokens = [
                    model.src_dict.encode_line(
                        real_source,
                        line_tokenizer=lambda x: x.split(),
                        add_if_not_exist=False,
                    ).long() for real_source in data
                ]
                src_lengths = [tokens.numel() for tokens in src_tokens]
                tgt_tokens = [
                    model.tgt_dict.encode_line(
                        target,
                        line_tokenizer=lambda x: x.split(),
                        add_if_not_exist=False,
                    ).long() for target in real_targets.values()
                ]
                tgt_lengths = [tokens.numel() for tokens in tgt_tokens]
                temp_dataset = LanguagePairDataset(
                    src_tokens,
                    src_lengths,
                    model.src_dict,
                    tgt_tokens,
                    tgt_lengths,
                    left_pad_source=args.left_pad_source,
                    left_pad_target=args.left_pad_target,
                    max_source_positions=self.max_source_positions,
                    max_target_positions=self.max_target_positions,
                )
                # reports[key] -> list of per-batch values from the criterion
                reports = collections.defaultdict(list)
                iterator = task.get_batch_iterator(
                    dataset=temp_dataset,
                    max_sentences=self.max_sentences,
                )
                for sample in iterator.next_epoch_itr(shuffle=False):
                    # Move the tensors consumed by the criterion to the GPU.
                    sample["net_input"]["src_tokens"] = sample["net_input"][
                        "src_tokens"].cuda()
                    sample["net_input"]["src_lengths"] = sample["net_input"][
                        "src_lengths"].cuda()
                    sample["net_input"]["prev_output_tokens"] = sample[
                        "net_input"]["prev_output_tokens"].cuda()
                    sample["target"] = sample["target"].cuda()
                    with torch.no_grad():
                        _, _, report = criterion(model.models[0], sample,
                                                 False)
                    for key, value in report.items():
                        reports[key].append(value)
                # NOTE(review): this pairing assumes the criterion reports one
                # value per sentence and that the iterator yields sentences in
                # the same order as `real_targets` -- confirm that
                # get_batch_iterator does not reorder the dataset.
                for key in ("loss", "nll_loss"):
                    for value, (sent_id, _) in zip(torch.cat(reports[key]),
                                                   real_targets.items()):
                        batch_results[sent_id][key][context_bias] = float(
                            value)
            for sent_id, value in batch_results.items():
                results[doc_id][sent_id] = value
        self.dump(dict(results))
示例#29
0
def main():
    """Generate paraphrases by translating English to French and back.

    Reads the model locations, an optional fairseq ``user_dir`` and the
    number of experts from the command line. For every non-empty line of the
    input files (``-`` means stdin), the line is translated with the en2fr
    model and then back with the fr2en mixture-of-experts model, once per
    expert; each paraphrase is printed on its own line.

    Raises:
        RuntimeError: when ``--user-dir`` is not given and the default
            ``translation_moe/src`` directory cannot be found.
    """
    arg_parser = argparse.ArgumentParser(description="")
    for flags, options in (
        (("--en2fr",), dict(required=True, help="path to en2fr model")),
        (
            ("--fr2en",),
            dict(required=True, help="path to fr2en mixture of experts model"),
        ),
        (
            ("--user-dir",),
            dict(help="path to fairseq examples/translation_moe/src directory"),
        ),
        (
            ("--num-experts",),
            dict(
                type=int,
                default=10,
                help="(keep at 10 unless using a different model)",
            ),
        ),
        (
            ("files",),
            dict(
                nargs="*",
                default=["-"],
                help='input files to paraphrase; "-" for stdin',
            ),
        ),
    ):
        arg_parser.add_argument(*flags, **options)
    args = arg_parser.parse_args()

    if args.user_dir is None:
        # Two levels up from this file is examples/.
        examples_root = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__))
        )
        args.user_dir = os.path.join(examples_root, "translation_moe", "src")
        user_dir_found = os.path.exists(args.user_dir)
        if not user_dir_found:
            raise RuntimeError(
                "cannot find fairseq examples/translation_moe/src "
                "(tried looking here: {})".format(args.user_dir)
            )
        logging.info("found user_dir:" + args.user_dir)

    # Settings shared by both translation directions.
    text_processing = {"tokenizer": "moses", "bpe": "sentencepiece"}

    logging.info("loading en2fr model from:" + args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr,
        **text_processing,
    ).eval()

    logging.info("loading fr2en model from:" + args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        **text_processing,
        user_dir=args.user_dir,
        task="translation_moe",
    ).eval()

    def paraphrase_sentence(english):
        """Return one back-translation of *english* per expert."""
        french = en2fr.translate(english)
        outputs = []
        for expert_id in range(args.num_experts):
            outputs.append(
                fr2en.translate(
                    french, inference_step_args={"expert": expert_id}
                )
            )
        return outputs

    logging.info("Type the input sentence and press return:")
    # map/filter are lazy, so lines are still consumed one at a time.
    stripped_lines = map(str.strip, fileinput.input(args.files))
    for sentence in filter(None, stripped_lines):
        for paraphrase in paraphrase_sentence(sentence):
            print(paraphrase)
示例#30
0
        sys.exit("'BPE codes' argument missing! Should be subword-nmt created with learn_bpe.py")

    if len(sys.argv) > 4:
        input_file = os.path.abspath(sys.argv[4])
    else:
        sys.exit("'Input text' argument missing!")

    if len(sys.argv) > 5:
        output_file = os.path.abspath(sys.argv[5])
    else:
        sys.exit("'Output text' argument missing!")

    with open(input_file, 'r') as f:
        text = f.read().strip().splitlines()

    fout = open(output_file, 'x')

    nopuncts2puncts = TransformerModel.from_pretrained(
        model_path,
        checkpoint_file='checkpoint_best.pt',
        data_name_or_path=data_path,
        bpe='subword_nmt',
        bpe_codes=bpe_codes
    )

    # Punctuate
    textout = nopuncts2puncts.translate(text)

    fout.write('\n'.join(textout))
    fout.close()