Пример #1
0
def build_model(options):
    """Load a pretrained image-captioning model and wrap it in a beam-search
    decoder ready for inference.

    Returns:
        A ``(generator, text_processor)`` pair, where ``generator`` is the
        beam decoder, possibly wrapped for fp16 and/or multi-GPU execution.
    """
    captioner = Seq2Seq.load(ImageCaptioning,
                             options.model_path,
                             tok_dir=options.tokenizer_path,
                             use_obj=options.obj)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    captioner = captioner.to(target_device)
    gpu_count = torch.cuda.device_count()

    decoder = BeamDecoder(captioner,
                          beam_width=options.beam_width,
                          max_len_a=options.max_len_a,
                          max_len_b=options.max_len_b,
                          len_penalty_ratio=options.len_penalty_ratio)
    if options.fp16:
        # Mixed precision via NVIDIA apex amp (opt level O2).
        decoder = amp.initialize(decoder, opt_level="O2")
    if gpu_count > 1:
        decoder = DataParallelModel(decoder)
    return decoder, captioner.text_processor
Пример #2
0
            [text_processor.lang_id(sentences[sid].strip().split(" ")[0])])
        yield sid, source_tokenized, torch.LongTensor(
            tids), candidates, src_lang, torch.LongTensor(target_langs)


if __name__ == "__main__":
    parser = get_option_parser()
    (options, args) = parser.parse_args()

    print("Loading text processor...")
    text_processor = TextProcessor(options.tokenizer_path)
    # One data processor per visible GPU; at least one on CPU-only machines.
    num_processors = max(torch.cuda.device_count(), 1)

    print("Loading model...")
    model = Seq2Seq.load(Seq2Seq,
                         options.model,
                         tok_dir=options.tokenizer_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    num_gpu = torch.cuda.device_count()

    # This script only supports single-GPU (or CPU) execution.
    assert num_gpu <= 1
    if options.fp16:
        # Mixed-precision inference via NVIDIA apex amp.
        model = amp.initialize(model, opt_level="O2")

    # Batch capacity budget; total_capacity is presumably given in millions
    # of tokens -- TODO confirm against the option parser.
    max_capacity = options.total_capacity * 1000000
    with torch.no_grad(), open(options.output, "w") as writer:
        print("Loading data...")
        # NOTE(review): marshal is only safe for trusted files written by the
        # same Python version.
        with open(options.sens, "rb") as fp, open(options.data, "rb") as fp2:
            sentences = marshal.load(fp)
            src2dst_dict = marshal.load(fp2)
Пример #3
0
    def train(options):
        """Train the Caption2Image model against a frozen, pretrained image
        captioner, using image-caption data with negative samples.

        Args:
            options: parsed command-line options (paths, model sizes,
                training schedule, etc.).
        """
        lexicon = None
        if options.dict_path is not None:
            lexicon = get_lex_dict(options.dict_path)
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        tokenizer = TextProcessor(options.tokenizer_path)
        # Downstream padding logic assumes pad id 0.
        assert tokenizer.pad_token_id() == 0

        captioner = Seq2Seq.load(ImageCaptioning,
                                 options.pretrained_path,
                                 tok_dir=options.tokenizer_path)
        c2i_model = Caption2Image(
            text_processor=tokenizer,
            enc_layer=options.encoder_layer,
            embed_dim=options.embed_dim,
            intermediate_dim=options.intermediate_layer_dim)

        print("Model initialization done!")

        # The collator is expected to yield one sub-batch per GPU
        # (a single batch on CPU-only machines).
        batch_collator = dataset.ImageTextCollator()
        sub_batches = max(1, torch.cuda.device_count())

        opt = build_optimizer(c2i_model,
                              options.learning_rate,
                              warump_steps=options.warmup)

        c2i_trainer = Caption2ImageTrainer(
            model=c2i_model,
            caption_model=captioner,
            mask_prob=options.mask_prob,
            optimizer=opt,
            clip=options.clip,
            beam_width=options.beam_width,
            max_len_a=options.max_len_a,
            max_len_b=options.max_len_b,
            len_penalty_ratio=options.len_penalty_ratio,
            fp16=options.fp16,
            mm_mode=options.mm_mode)

        pin = torch.cuda.is_available()
        train_loader = ImageMTTrainer.get_img_loader(
            batch_collator,
            dataset.ImageCaptionDatasetwNegSamples,
            options.train_path,
            c2i_model,
            sub_batches,
            options,
            pin,
            lex_dict=lexicon)
        dev_loader = ImageMTTrainer.get_img_loader(
            batch_collator,
            dataset.ImageCaptionDatasetwNegSamples,
            options.dev_path,
            c2i_model,
            sub_batches,
            options,
            pin,
            lex_dict=lexicon,
            shuffle=False,
            denom=2)

        step, epoch = 0, 1
        while options.step > 0 and step < options.step:
            print("train epoch", epoch)
            step = c2i_trainer.train_epoch(img_data_iter=train_loader,
                                           img_dev_data_iter=dev_loader,
                                           max_step=options.step,
                                           lex_dict=lexicon,
                                           saving_path=options.model_path,
                                           step=step)
            epoch += 1
Пример #4
0
import random
from seq2seq import Seq2Seq, END, pad_arrays
from mcts import Node, mcts
from preprocess import tokenize


# Restore the trained seq2seq reaction model from disk.
model = Seq2Seq.load('model')

# Read the set of base (starting) compounds, one SMILES string per line.
with open('data/base_compounds.smi', 'r') as f:
    starting_mols = {line.strip() for line in f}

print('Base compounds:', len(starting_mols))


def to_doc(mol):
    """Encode a molecule (SMILES string) as a list of vocab ids, wrapped in
    the start token '<S>' and the END id."""
    ids = [model.vocab2id['<S>']]
    for token in tokenize(mol):
        ids.append(model.vocab2id[token])
    ids.append(END)
    return ids


def process_seq(seq):
    """Decode a sequence of token ids into a SMILES string and split it on
    '>' into reactants and (optional) reagents."""
    # Convert ids to tokens and drop START/END tokens
    # (ids 0 and 1 are presumably the special START/END ids -- verify
    # against the vocabulary construction).
    smis = ''.join([model.id2vocab[id] for id in seq if id not in [0, 1]])
    parts = smis.split('>')
    if len(parts) > 1:
        # There shouldn't be more than two parts
        reactants, reagents = parts[0], parts[1]
    else:
        reactants = parts[0]
Пример #5
0
    def train(options):
        """Train the ImageMassSeq2Seq multimodal MT model.

        Two phases: an optional main training phase over image-caption /
        MASS / MT data (``options.step`` steps), followed by an optional
        back-translation finetuning phase (``options.finetune_step`` steps).

        Args:
            options: parsed command-line options (paths, model sizes,
                distributed rank, training schedule, etc.).
        """
        lex_dict = None
        if options.dict_path is not None:
            lex_dict = get_lex_dict(options.dict_path)
        # Only the rank-0 (or single-process) worker creates the output dir.
        if options.local_rank <= 0 and not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        text_processor = TextProcessor(options.tokenizer_path)
        # Downstream padding logic assumes pad id 0.
        assert text_processor.pad_token_id() == 0
        # One loader shard per GPU unless running distributed
        # (local_rank >= 0), in which case each process uses one.
        num_processors = max(torch.cuda.device_count(),
                             1) if options.local_rank < 0 else 1

        if options.pretrained_path is not None:
            mt_model = Seq2Seq.load(ImageMassSeq2Seq,
                                    options.pretrained_path,
                                    tok_dir=options.tokenizer_path)
        else:
            mt_model = ImageMassSeq2Seq(
                use_proposals=lex_dict is not None,
                tie_embed=options.tie_embed,
                text_processor=text_processor,
                resnet_depth=options.resnet_depth,
                lang_dec=options.lang_decoder,
                enc_layer=options.encoder_layer,
                dec_layer=options.decoder_layer,
                embed_dim=options.embed_dim,
                intermediate_dim=options.intermediate_layer_dim)

        if options.lm_path is not None:
            # Initialize the model from a pretrained language model.
            lm = LM(text_processor=text_processor,
                    enc_layer=options.encoder_layer,
                    embed_dim=options.embed_dim,
                    intermediate_dim=options.intermediate_layer_dim)
            mt_model.init_from_lm(lm)

        print("Model initialization done!")

        # The collator is expected to return one sub-batch per GPU
        # (a single batch on CPU-only machines).
        collator = dataset.ImageTextCollator()
        num_batches = max(1, torch.cuda.device_count())

        if options.continue_train:
            # Resume training with the previously pickled optimizer state.
            with open(os.path.join(options.pretrained_path, "optim"),
                      "rb") as fp:
                optimizer = pickle.load(fp)
        else:
            optimizer = build_optimizer(mt_model,
                                        options.learning_rate,
                                        warump_steps=options.warmup)
        trainer = ImageMTTrainer(model=mt_model,
                                 mask_prob=options.mask_prob,
                                 optimizer=optimizer,
                                 clip=options.clip,
                                 beam_width=options.beam_width,
                                 max_len_a=options.max_len_a,
                                 max_len_b=options.max_len_b,
                                 len_penalty_ratio=options.len_penalty_ratio,
                                 fp16=options.fp16,
                                 mm_mode=options.mm_mode,
                                 rank=options.local_rank)

        pin_memory = torch.cuda.is_available()
        img_train_loader = ImageMTTrainer.get_img_loader(
            collator,
            dataset.ImageCaptionDataset,
            options.train_path,
            mt_model,
            num_batches,
            options,
            pin_memory,
            lex_dict=lex_dict)

        mass_train_data, mass_train_loader, finetune_loader, mt_dev_loader = None, None, None, None
        if options.mass_train_path is not None:
            mass_train_paths = options.mass_train_path.strip().split(",")
            if options.step > 0:
                # Keep the raw examples only if a finetuning phase will
                # reuse them below.
                mass_train_data, mass_train_loader = ImageMTTrainer.get_mass_loader(
                    mass_train_paths,
                    mt_model,
                    num_processors,
                    options,
                    pin_memory,
                    keep_examples=options.finetune_step > 0,
                    lex_dict=lex_dict)

            if options.finetune_step > 0:
                finetune_loader, finetune_data = ImageMTTrainer.get_mass_finetune_data(
                    mass_train_data,
                    mass_train_paths,
                    mt_model,
                    num_processors,
                    options,
                    pin_memory,
                    lex_dict=lex_dict)

        mt_train_loader = None
        if options.mt_train_path is not None:
            mt_train_loader = ImageMTTrainer.get_mt_train_data(
                mt_model,
                num_processors,
                options,
                pin_memory,
                lex_dict=lex_dict)

        mt_dev_loader = None
        if options.mt_dev_path is not None:
            mt_dev_loader = ImageMTTrainer.get_mt_dev_data(mt_model,
                                                           options,
                                                           pin_memory,
                                                           text_processor,
                                                           trainer,
                                                           lex_dict=lex_dict)

        # ---- Phase 1: main training loop ----
        step, train_epoch = 0, 1
        while options.step > 0 and step < options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(img_data_iter=img_train_loader,
                                       mass_data_iter=mass_train_loader,
                                       mt_train_iter=mt_train_loader,
                                       max_step=options.step,
                                       lex_dict=lex_dict,
                                       mt_dev_iter=mt_dev_loader,
                                       saving_path=options.model_path,
                                       step=step,
                                       save_opt=options.save_opt,
                                       accum=options.accum)
            train_epoch += 1

        finetune_epoch = 0
        # Resetting the optimizer for the purpose of finetuning.
        trainer.optimizer.reset()

        # Language directions used for back-translation during finetuning.
        lang_directions = ImageMTTrainer.get_lang_dirs(options.bt_langs,
                                                       text_processor)
        print(options.local_rank, "lang dirs", lang_directions)

        print(options.local_rank,
              "Reloading image train data with new batch size...")

        # Finetuning halves the effective batch size (denom=2).
        if options.finetune_step > 0 and img_train_loader is not None:
            img_train_loader = ImageMTTrainer.get_img_loader(
                collator,
                dataset.ImageCaptionDataset,
                options.train_path,
                mt_model,
                num_batches,
                options,
                pin_memory,
                denom=2,
                lex_dict=lex_dict)
        if options.ignore_mt_mass:
            mt_train_loader = None
        print(options.local_rank,
              "Reloading image train data with new batch size done!")

        # ---- Phase 2: back-translation finetuning loop ----
        while options.finetune_step > 0 and step <= options.finetune_step + options.step:
            print(options.local_rank, "finetune epoch", finetune_epoch)
            step = trainer.train_epoch(img_data_iter=img_train_loader,
                                       mass_data_iter=finetune_loader,
                                       mt_train_iter=mt_train_loader,
                                       max_step=options.finetune_step +
                                       options.step,
                                       mt_dev_iter=mt_dev_loader,
                                       saving_path=options.model_path,
                                       step=step,
                                       fine_tune=True,
                                       lang_directions=lang_directions,
                                       lex_dict=lex_dict,
                                       save_opt=options.save_opt,
                                       accum=options.accum,
                                       beam_width=options.bt_beam_width)
            finetune_epoch += 1
Пример #6
0
    def train(options):
        """Train the SenSim sentence-similarity model, optionally warm-started
        from a pretrained seq2seq checkpoint, using parallel MT data plus
        source- and target-side negative-sample streams.

        Args:
            options: parsed command-line options (paths, model sizes,
                training schedule, etc.).
        """
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        tokenizer = TextProcessor(options.tokenizer_path)
        # Downstream padding logic assumes pad id 0.
        assert tokenizer.pad_token_id() == 0
        workers = max(torch.cuda.device_count(), 1)

        sim_model = SenSim(text_processor=tokenizer,
                           enc_layer=options.encoder_layer,
                           embed_dim=options.embed_dim,
                           intermediate_dim=options.intermediate_layer_dim)

        if options.pretrained_path is not None:
            # Warm-start from a pretrained seq2seq checkpoint.
            pretrained = Seq2Seq.load(Seq2Seq,
                                      options.pretrained_path,
                                      tok_dir=options.tokenizer_path)
            sim_model.init_from_lm(pretrained)

        print("Model initialization done!")

        opt = build_optimizer(sim_model,
                              options.learning_rate,
                              warump_steps=options.warmup)
        sim_trainer = SenSimTrainer(model=sim_model,
                                    mask_prob=options.mask_prob,
                                    optimizer=opt,
                                    clip=options.clip,
                                    fp16=options.fp16)

        pin = torch.cuda.is_available()

        train_loader = SenSimTrainer.get_mt_train_data(
            sim_model, workers, options, pin)

        def neg_loader(pickle_dir):
            # Negative-sample stream: oversized (x5) batches, shuffled.
            neg_data = dataset.MassDataset(
                batch_pickle_dir=pickle_dir,
                max_batch_capacity=workers * options.total_capacity * 5,
                max_batch=workers * options.batch * 5,
                pad_idx=sim_model.text_processor.pad_token_id(),
                keep_pad_idx=False,
                max_seq_len=options.max_seq_len,
                keep_examples=False)
            return data_utils.DataLoader(neg_data,
                                         batch_size=1,
                                         shuffle=True,
                                         pin_memory=pin)

        src_neg_loader = neg_loader(options.src_neg)
        dst_neg_loader = neg_loader(options.dst_neg)

        dev_loader = None
        if options.mt_dev_path is not None:
            dev_loader = SenSimTrainer.get_mt_dev_data(
                sim_model, options, pin, tokenizer, sim_trainer)

        step, epoch = 0, 1
        sim_trainer.best_loss = 1000000
        while options.step > 0 and step < options.step:
            print("train epoch", epoch)
            step = sim_trainer.train_epoch(mt_train_iter=train_loader,
                                           max_step=options.step,
                                           mt_dev_iter=dev_loader,
                                           saving_path=options.model_path,
                                           step=step,
                                           src_neg_iter=src_neg_loader,
                                           dst_neg_iter=dst_neg_loader)
            epoch += 1
Пример #7
0
    def train(options):
        """Train the ImageCaptioning model, optionally multi-tasked with MT
        data, evaluating against reference captions on the dev set.

        Args:
            options: parsed command-line options (paths, model sizes,
                training schedule, etc.).
        """
        lex_dict = None
        if options.dict_path is not None:
            lex_dict = get_lex_dict(options.dict_path)
        if not os.path.exists(options.model_path):
            os.makedirs(options.model_path)

        text_processor = TextProcessor(options.tokenizer_path)
        # Downstream padding logic assumes pad id 0.
        assert text_processor.pad_token_id() == 0

        if options.pretrained_path is not None:
            caption_model = Seq2Seq.load(ImageCaptioning,
                                         options.pretrained_path,
                                         tok_dir=options.tokenizer_path)
        else:
            caption_model = ImageCaptioning(
                use_proposals=lex_dict is not None,
                tie_embed=options.tie_embed,
                text_processor=text_processor,
                resnet_depth=options.resnet_depth,
                lang_dec=options.lang_decoder,
                enc_layer=options.encoder_layer,
                dec_layer=options.decoder_layer,
                embed_dim=options.embed_dim,
                intermediate_dim=options.intermediate_layer_dim,
                use_obj=not options.no_obj)

        if options.lm_path is not None:  # In our case, this is an MT model.
            # Transplant encoder/decoder/output layers from the pretrained MT
            # model; layer counts must match for the swap to be valid.
            mt_pret_model = Seq2Seq.load(ImageMassSeq2Seq,
                                         options.lm_path,
                                         tok_dir=options.tokenizer_path)
            assert len(caption_model.encoder.encoder.layer) == len(
                mt_pret_model.encoder.encoder.layer)
            assert len(caption_model.decoder.decoder.layer) == len(
                mt_pret_model.decoder.decoder.layer)
            caption_model.encoder = mt_pret_model.encoder
            caption_model.decoder = mt_pret_model.decoder
            caption_model.output_layer = mt_pret_model.output_layer

        print("Model initialization done!")

        # The collator is expected to return one sub-batch per GPU
        # (a single batch on CPU-only machines).
        collator = dataset.ImageTextCollator()
        num_batches = max(1, torch.cuda.device_count())

        if options.continue_train:
            # Resume training with the previously pickled optimizer state.
            with open(os.path.join(options.pretrained_path, "optim"),
                      "rb") as fp:
                optimizer = pickle.load(fp)
        else:
            optimizer = build_optimizer(caption_model,
                                        options.learning_rate,
                                        warump_steps=options.warmup)
        trainer = ImageCaptionTrainer(
            model=caption_model,
            mask_prob=options.mask_prob,
            optimizer=optimizer,
            clip=options.clip,
            beam_width=options.beam_width,
            max_len_a=options.max_len_a,
            max_len_b=options.max_len_b,
            len_penalty_ratio=options.len_penalty_ratio,
            fp16=options.fp16,
            mm_mode=options.mm_mode)

        pin_memory = torch.cuda.is_available()
        img_train_loader = ImageMTTrainer.get_img_loader(
            collator,
            dataset.ImageCaptionDataset,
            options.train_path,
            caption_model,
            num_batches,
            options,
            pin_memory,
            lex_dict=lex_dict,
            shuffle=(options.local_rank < 0))
        # One loader shard per GPU unless running distributed
        # (local_rank >= 0), in which case each process uses one.
        num_processors = max(torch.cuda.device_count(),
                             1) if options.local_rank < 0 else 1
        mt_train_loader = None
        if options.mt_train_path is not None:
            mt_train_loader = ImageMTTrainer.get_mt_train_data(
                caption_model,
                num_processors,
                options,
                pin_memory,
                lex_dict=lex_dict)

        img_dev_loader = ImageMTTrainer.get_img_loader(
            collator,
            dataset.ImageCaptionTestDataset,
            options.dev_path,
            caption_model,
            num_batches,
            options,
            pin_memory,
            lex_dict=lex_dict,
            shuffle=False,
            denom=2)

        # Pre-decode the dev-set reference captions (image id -> list of
        # decoded caption strings) for caption evaluation during training.
        trainer.caption_reference = None
        if img_dev_loader is not None:
            trainer.caption_reference = defaultdict(list)
            # Unwrap DataParallel if present to reach the underlying generator.
            generator = (trainer.generator.module if hasattr(
                trainer.generator, "module") else trainer.generator)
            for data in img_dev_loader:
                for batch in data:
                    for b in batch:
                        captions = b["captions"]
                        for id in captions:
                            for caption in captions[id]:
                                # Cut each caption at the first EOS and drop
                                # its leading (language) token.
                                refs = get_outputs_until_eos(
                                    text_processor.sep_token_id(),
                                    caption,
                                    remove_first_token=True)
                                ref = [
                                    generator.seq2seq_model.text_processor.
                                    tokenizer.decode(ref.numpy())
                                    for ref in refs
                                ]
                                trainer.caption_reference[id] += ref
            print("Number of dev image/captions",
                  len(trainer.caption_reference))

        mt_dev_loader = None
        if options.mt_dev_path is not None:
            mt_dev_loader = ImageMTTrainer.get_mt_dev_data(caption_model,
                                                           options,
                                                           pin_memory,
                                                           text_processor,
                                                           trainer,
                                                           lex_dict=lex_dict)
            print("Number of dev sentences", len(trainer.reference))

        step, train_epoch = 0, 1
        while options.step > 0 and step < options.step:
            print("train epoch", train_epoch)
            step = trainer.train_epoch(img_data_iter=img_train_loader,
                                       img_dev_data_iter=img_dev_loader,
                                       max_step=options.step,
                                       lex_dict=lex_dict,
                                       mt_train_iter=mt_train_loader,
                                       saving_path=options.model_path,
                                       step=step,
                                       accum=options.accum,
                                       mt_dev_iter=mt_dev_loader,
                                       mtl_weight=options.mtl_weight)
            train_epoch += 1