def model_fn(features, targets):
    """Build the inference graph: decode `features` with a Transformer.

    Returns an (predictions_dict, loss, train_op) tuple; loss and
    train_op are None because mode is fixed to INFER.
    """
    mode = tf.contrib.learn.ModeKeys.INFER
    features = _decode_input_tensor_to_features_dict(features, hparams)
    dp = data_parallelism()
    model_class = Transformer(hparams, mode, dp)
    result_list = model_class.infer(
        features,
        beam_size=FLAGS.decode_beam_size,
        # Return all beams only when requested; otherwise just the best one.
        top_beams=(FLAGS.decode_beam_size if FLAGS.decode_return_beams else 1),
        alpha=FLAGS.decode_alpha,
        decode_length=FLAGS.decode_extra_length)
    if not isinstance(result_list, dict):  ## greedy
        ret = {"outputs": result_list}, None, None
    else:  ## beam
        ret = {
            "outputs": result_list["outputs"],
            "scores": result_list["scores"]
        }, None, None
    # Echo inputs/targets back in the predictions dict when present so the
    # caller can align decodes with their sources.
    if "inputs" in features:
        ret[0]["inputs"] = features["inputs"]
    if "infer_targets" in features:
        ret[0]["targets"] = features["infer_targets"]
    return ret
def __init__(self, num_points=2000, K=3):
    """PointNet base: input/embedding transformers plus shared-weight MLPs.

    Args:
        num_points: number of points per input cloud.
        K: dimensionality of each input point (3 for XYZ; larger if
           normals, colors, etc. are included).
    """
    # Call the super constructor
    super(PointNetBase, self).__init__()

    # Input transformer for K-dimensional input
    # K should be 3 for XYZ coordinates, but can be larger if normals,
    # colors, etc are included
    self.input_transformer = Transformer(num_points, K)

    # Embedding transformer is always going to be 64 dimensional
    self.embedding_transformer = Transformer(num_points, 64)

    # Multilayer perceptrons with shared weights are implemented as
    # convolutions. This is because we are mapping from K inputs to 64
    # outputs, so we can just consider each of the 64 K-dim filters as
    # describing the weight matrix for each point dimension (X,Y,Z,...) to
    # each index of the 64 dimension embeddings
    self.mlp1 = nn.Sequential(nn.Conv1d(K, 64, 1), nn.BatchNorm1d(64),
                              nn.ReLU(), nn.Conv1d(64, 64, 1),
                              nn.BatchNorm1d(64), nn.ReLU())

    self.mlp2 = nn.Sequential(nn.Conv1d(64, 64, 1), nn.BatchNorm1d(64),
                              nn.ReLU(), nn.Conv1d(64, 128, 1),
                              nn.BatchNorm1d(128), nn.ReLU(),
                              nn.Conv1d(128, 1024, 1), nn.BatchNorm1d(1024),
                              nn.ReLU())
def setUp(self):
    """Build a fresh Transformer and TF session before each test case."""
    self._feature_dims = [128, 4, 39]
    self._targets_dim_list = [5]
    self.model = Transformer(self._feature_dims, self._targets_dim_list,
                             {"vocab_size": 128})
    self.sess = tf.Session()
def get_model():
    """Build the model for the n-th problem, plus some added variables."""
    model_class = Transformer(hparams, mode, dp)  ##!!!!
    sharded_logits, training_loss, extra_loss = model_class.model_fn(
        features)
    # Update exponential moving averages (decay 0.9) of the three losses
    # stored in the shared "losses_avg" scope.
    with tf.variable_scope("losses_avg", reuse=True):
        loss_moving_avg = tf.get_variable("training_loss")
        o1 = loss_moving_avg.assign(loss_moving_avg * 0.9 + training_loss * 0.1)
        loss_moving_avg = tf.get_variable("extra_loss")
        o2 = loss_moving_avg.assign(loss_moving_avg * 0.9 + extra_loss * 0.1)
        loss_moving_avg = tf.get_variable("total_loss")
        total_loss = training_loss + extra_loss
        o3 = loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1)
    with tf.variable_scope(
            "train_stats"):  # Count steps for this problem.
        problem_steps = tf.get_variable("steps", initializer=0,
                                        trainable=False)
        o4 = problem_steps.assign_add(1)
    with tf.control_dependencies([o1, o2, o3, o4]):  # Make sure the ops run.
        # Ensure the loss is a scalar here.
        total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
    return [total_loss
            ] + sharded_logits  # Need to flatten for cond later.
def add_context_tr_emb_op(self):
    """Run word embeddings through a bidirectional transformer encoder
    and store the result as the context embedding."""
    hparams = dict(num_units=300,
                   dropout=self.dropout,
                   is_training=True,
                   num_multi_head=6,
                   num_heads=6,
                   max_seq_len=512)
    with tf.variable_scope("context-bi-transformer"):
        encoder = Transformer(hparams)
        self.context_emb = encoder.encoder(self.word_embeddings,
                                           self.words_len)
def generate_predictions(input_file_path: str, pred_file_path: str):
    """Generates predictions for the machine translation task (EN->FR).

    You are allowed to modify this function as needed, but one again, you
    cannot modify any other part of this file. We will be importing only
    this function in our final evaluation script. Since you will most
    definitely need to import modules for your code, you must import these
    inside the function itself.

    Args:
        input_file_path: the file path that contains the input data.
        pred_file_path: the file path where to store the predictions.

    Returns: None
    """
    ##### MODIFY BELOW #####
    data_dir = '/project/cq-training-1/project2/teams/team12/data/'
    best_model_path = 'saved_model/Transformer-num_layers_2-d_model_128-num_heads_8-dff_512_fr_to_en_False_embedding_'\
        'None_embedding_dim_128_back_translation_True_ratio_4.0'
    path_en = os.path.join(data_dir, 'train.lang1')
    path_fr = os.path.join(data_dir, 'train.lang2')

    # Create vocabs
    logger.info('Creating vocab...')
    word2idx_en, idx2word_en = utils.create_vocab(path_en, vocab_size=None)
    word2idx_fr, idx2word_fr = utils.create_vocab(path_fr, vocab_size=None)

    # Load data
    logger.info('Loading data...')
    data = utils.load_data(input_file_path, word2idx_en)
    # Pad variable-length id sequences into batches of 128.
    dataset = tf.data.Dataset.from_generator(
        lambda: [ex for ex in data],
        tf.int64,
        output_shapes=tf.TensorShape([None
                                      ])).padded_batch(128,
                                                       padded_shapes=[None])

    # Load model
    model_config = {
        'num_layers': 2,
        'd_model': 128,
        'dff': 512,
        'num_heads': 8
    }
    model = Transformer(model_config, len(word2idx_en), word2idx_fr)
    model.load_weights(os.path.join(best_model_path, "model"))

    # Write prediction to file
    with open(pred_file_path, 'w') as f:
        logger.info('Opening file and writing predictions...')
        for batch in tqdm(dataset,
                          desc='Translating...',
                          total=len(data) // 128 + 1):
            # NOTE(review): labels are zero tensors — the model appears to
            # decode from dummy targets in one forward pass; confirm against
            # the Transformer's call convention.
            preds = model({'inputs': batch, 'labels': tf.zeros_like(batch)})
            for pred in preds:
                sentence = utils.generate_sentence_from_probabilities(
                    pred.numpy(), idx2word_fr)
                f.writelines([sentence, '\n'])
def main():
    """Parse CLI arguments, apply overrides to the config, seed torch,
    build the Transformer (optionally data-parallel/CUDA) and train."""
    config = Config()
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', '-b', type=int, default=64,
                        help='batch size for train')
    parser.add_argument('--epoch', '-e', type=int, default=50,
                        help='number of training epochs')
    parser.add_argument('--n_layer', '-n', type=int, default=6,
                        help='number of encoder layers')
    parser.add_argument('-seed', '-s', type=int, default=123,
                        help="Random seed")
    parser.add_argument('--save_model', '-m', action='store_true',
                        default=False, help="whether to save model")
    parser.add_argument('--checkpoint', '-c', type=int, default=0,
                        help="load model")
    args = parser.parse_args()
    ########test##########
    # args.batch_size = 2
    ########test##########
    # Copy CLI values onto the config (a value of 0 is treated as unset).
    if args.batch_size:
        config.batch_size = args.batch_size
    if args.n_layer:
        config.n_layer = args.n_layer
    # seed
    torch.manual_seed(args.seed)
    # rouge initalization — truncates the rouge results file.
    # NOTE(review): the file handle returned by open() is never closed.
    open(config.filename_rouge, 'w')
    model = Transformer(config)
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
    if torch.cuda.is_available():
        model = model.cuda()
    train(args, config, model)
def __init__(self,
             inp,
             oup,
             stride,
             channels,
             kernel_sizes,
             expand,
             active_fn=None,
             batch_norm_kwargs=None,
             se_ratio=0.5,
             use_transformer=False,
             downsampling_transformer=False):
    """Fused inverted-residual block with an optional transformer branch.

    Args:
        inp: input channel count.
        oup: output channel count.
        stride: spatial stride; must be 1 or 2.
        channels: per-branch channel widths (same length as kernel_sizes).
        kernel_sizes: per-branch kernel sizes.
        expand: expansion setting forwarded to ``_build``.
        active_fn: activation constructor forwarded to ``_build``.
        batch_norm_kwargs: extra kwargs for batch-norm layers.
        se_ratio: squeeze-excite ratio forwarded to ``_build``.
        use_transformer: attach a Transformer when a residual connection
            exists (or, with ``downsampling_transformer``, when it does not).
        downsampling_transformer: attach a possibly-downsampling
            Transformer on the non-residual path.
    """
    import math  # local import: gcd for the residual-conv group count

    super(InvertedResidualChannelsFused, self).__init__()
    assert stride in [1, 2]
    assert len(channels) == len(kernel_sizes)

    self.input_dim = inp
    self.output_dim = oup
    self.expand = expand
    self.stride = stride
    self.kernel_sizes = kernel_sizes
    self.channels = channels
    # Residual connection only when the shape is fully preserved.
    self.use_res_connect = self.stride == 1 and inp == oup
    self.batch_norm_kwargs = batch_norm_kwargs
    self.active_fn = active_fn
    self.se_ratio = se_ratio
    self.use_transformer = use_transformer
    self.downsampling_transformer = downsampling_transformer

    (self.expand_conv, self.depth_ops, self.project_conv,
     self.se_op) = self._build(channels, kernel_sizes, expand, se_ratio)

    if not self.use_res_connect:
        # Largest group count dividing both channel dims is, by definition,
        # gcd(inp, oup). The original computed this with an O(inp) scan
        # over all divisors; math.gcd is equivalent and O(log n).
        group = math.gcd(self.input_dim, self.output_dim)
        self.residual = nn.Conv2d(self.input_dim,
                                  self.output_dim,
                                  kernel_size=1,
                                  stride=self.stride,
                                  padding=0,
                                  groups=group,
                                  bias=False)
    if self.use_transformer and self.use_res_connect:
        self.transformer = Transformer(8, inp)
    if self.use_transformer and self.downsampling_transformer and not self.use_res_connect:
        self.transformer = Transformer(8, inp, oup,
                                       downsampling=(stride == 2))
def load_model(self):
    """Instantiate the Transformer, optionally restore previous weights,
    then immediately write a checkpoint to the configured path."""
    model = Transformer(basic_params=self.basic_params,
                        encoder_params=self.encoder_params,
                        decoder_params=self.decoder_params,
                        src_pad_idx=PAD,
                        tgt_pad_idx=PAD)
    prev_model = self.basic_params["paths"]["prev_model"]
    if prev_model:
        model.load_state_dict(torch.load(prev_model))
    self.model = model
    torch.save(self.model.state_dict(), self.basic_params["ckpt_path"])
def __init__(self,
             step=1,
             if_noise=False,
             noise_dim=3,
             noise_stdv=1e-2,
             dim_tail=32):
    """Point-cloud refinement step: a PointNet++ SA/FP hierarchy with
    transformers inserted after the first two set-abstraction levels.

    Args:
        step: refinement step index (also forwarded to the Unit modules).
        if_noise: when True, `noise_dim` extra channels are added to the
            3 XYZ input channels of the first SA module.
        noise_dim: number of noise channels (used only when if_noise).
        noise_stdv: noise standard deviation (stored; presumably used by
            the forward pass — not visible here).
        dim_tail: extra channels concatenated before the final MLP.
    """
    super(StepModelTransformer, self).__init__()
    self.step = step
    self.if_noise = if_noise
    self.noise_dim = noise_dim
    self.noise_stdv = noise_stdv
    self.dim_tail = dim_tail

    # Set-abstraction (downsampling) pyramid: 512 pts -> 128 pts -> global.
    self.sa_module_1 = PointNet_SA_Module(
        512, 32, 0.2, 3 + (self.noise_dim if self.if_noise else 0),
        [64, 64, 128], group_all=False)
    self.transformer_start_1 = Transformer(128, dim=64)
    self.sa_module_2 = PointNet_SA_Module(128, 32, 0.4, 128,
                                          [128, 128, 256], group_all=False)
    self.transformer_start_2 = Transformer(256, dim=64)
    self.sa_module_3 = PointNet_SA_Module(None, None, None, 256,
                                          [256, 512, 1024], group_all=True)

    # Feature-propagation (upsampling) path mirroring the SA pyramid.
    self.fp_module_3 = PointNet_FP_Module(1024, [256, 256],
                                          use_points1=True,
                                          in_channel_points1=256)
    self.fp_module_2 = PointNet_FP_Module(256, [256, 128],
                                          use_points1=True,
                                          in_channel_points1=128)
    self.fp_module_1 = PointNet_FP_Module(128, [128, 128, 128],
                                          use_points1=True,
                                          in_channel_points1=6)

    self.unit_3 = Unit(step=step, in_channel=256)
    self.unit_2 = Unit(step=step, in_channel=128)
    self.unit_1 = Unit(step=step, in_channel=128)

    # Final shared MLP mapping per-point features to 3-D coordinates.
    mlp = [128, 64, 3]
    last_channel = 128 + self.dim_tail  # (32 if self.step == 1 else 0)
    mlp_conv = []
    for out_channel in mlp[:-1]:
        mlp_conv.append(Conv1d(last_channel, out_channel, if_bn=True))
        last_channel = out_channel
    mlp_conv.append(
        Conv1d(last_channel, mlp[-1], if_bn=False, activation_fn=None))
    self.mlp_conv = nn.Sequential(*mlp_conv)
def main(args):
    """Main function for overall process.

    Arguments:
        | args: Arguments used for overall process.

    The process is fairly straightforward. Training is conducted first with
    evaluation being conducted at the end of each training epoch. The best
    model is saved as a PyTorch file.
    """
    global_process_start = time.time()
    msg_format = '[%(asctime)s - %(levelname)s - %(filename)s: %(lineno)d (%(funcName)s)] %(message)s'
    logging.basicConfig(format=msg_format, level=logging.INFO,
                        handlers=[logging.FileHandler(filename=args.log_filename),
                                  logging.StreamHandler()])

    data = WMT2014Dataset(args)
    model = Transformer(args)

    if args.multiple_gpu:
        logger.info("Using multiple GPU's!")
        os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
        model = nn.DataParallel(model)
    model = model.to('cuda')
    wandb.watch(model)

    train_start = time.time()
    best_pred, best_epoch = train(args, model, data)
    train_end = time.time()
    logger.info(
        f"Training took approximately {time.strftime('%H:%M:%S', time.gmtime(train_end - train_start))}"
    )

    # If we evaluated during training, write predictions.
    if best_pred:
        pred_filename = f"../predictions/{args.wandb_name}_pred_epoch{best_epoch}.txt"
        logger.info(f"Writing predictions and targets to {pred_filename}.")
        with open(file=pred_filename, mode='w') as f:
            f.write('\n'.join(best_pred) + '\n')

    model_file_name = args.log_filename.split('/')[-1]
    model_save_file = os.path.join(args.model_save_dir, model_file_name)
    # Bug fix: the log previously claimed the model was saved under
    # args.log_filename (the full log path); it is actually saved under
    # model_file_name inside model_save_dir.
    logger.info(
        f"Saving model in {args.model_save_dir} as {model_file_name}")
    torch.save(model.state_dict(), model_save_file)

    global_process_end = time.time()
    logger.info(
        f"End of process. Took approximately {time.strftime('%H:%M:%S', time.gmtime(global_process_end - global_process_start))}"
    )
def build_paddle_model():
    """Assemble a DETR model: ResNet-50 backbone with sine positional
    embeddings joined to a 6+6-layer transformer (COCO: 91 classes,
    100 object queries)."""
    hidden_dim = 256
    num_classes = 91
    num_queries = 100

    position_embedding = PositionEmbeddingSine(hidden_dim // 2,
                                               normalize=True)
    resnet = Backbone('resnet50', False, True, False)
    backbone = Joiner(resnet, position_embedding)
    backbone.num_channels = 2048

    transformer = Transformer(d_model=hidden_dim,
                              dropout=0,
                              nhead=8,
                              dim_feedforward=2048,
                              num_encoder_layers=6,
                              num_decoder_layers=6,
                              normalize_before=False,
                              return_intermediate_dec=True)

    return DETR(backbone,
                transformer,
                num_classes=num_classes,
                num_queries=num_queries,
                aux_loss=True)
def _make_detr(
    backbone_name: str,
    num_queries=100,
    mask=False,
    qa_dataset=None,
    predict_final=False,
    text_encoder="roberta-base",
    contrastive_align_loss=True,
):
    """Build an MDETR model around the named backbone; wrap it in a
    segmentation head when `mask` is set."""
    backbone = _make_backbone(backbone_name, mask)
    transformer = Transformer(d_model=256,
                              return_intermediate_dec=True,
                              text_encoder_type=text_encoder)
    model = MDETR(backbone,
                  transformer,
                  num_classes=255,
                  num_queries=num_queries,
                  qa_dataset=qa_dataset,
                  predict_final=predict_final,
                  contrastive_align_loss=contrastive_align_loss,
                  contrastive_hdim=64)
    return DETRsegm(model) if mask else model
def Decode(save_file):
    """Restore the latest Transformer checkpoint and report dev-set error.

    NOTE(review): `save_file` is unused; all paths come from the global
    `args`.
    """
    dataset_dev = ASR_align_ArkDataSet(scp_file=args.dirs.dev.scp,
                                       trans_file=args.dirs.dev.trans,
                                       align_file=None,
                                       feat_len_file=None,
                                       args=args,
                                       _shuffle=False,
                                       transform=False)
    feature_dev = TFData(dataset=dataset_dev,
                         dir_save=args.dirs.dev.tfdata,
                         args=args).read(_shuffle=False, transform=True)
    feature_dev = feature_dev.padded_batch(args.batch_size,
                                           ((), [None, args.dim_input]))
    # Transformer(args) yields a pair; only the inference model is kept.
    _, model_infer = Transformer(args)
    # model.summary()
    model_infer.summary()

    # Optimizer is only needed to satisfy the Checkpoint signature used
    # at training time; it is not stepped here.
    optimizer = tf.keras.optimizers.Adam(1e-4)
    ckpt = tf.train.Checkpoint(model=model_infer, optimizer=optimizer)
    _ckpt_manager = tf.train.CheckpointManager(ckpt,
                                               args.dirs.checkpoint,
                                               max_to_keep=1)
    ckpt.restore(_ckpt_manager.latest_checkpoint)
    print('checkpoint {} restored!!'.format(_ckpt_manager.latest_checkpoint))
    cer = evaluate(feature_dev, dataset_dev, args.data.dev_size, model_infer)
    # NOTE(review): the label says PER but the variable is named cer —
    # confirm which error rate `evaluate` returns.
    print('PER:{:.3f}'.format(cer))
def main():
    """Build DETR-R50 (COCO config), load pretrained weights from disk,
    and export them via gen_wts."""
    device = torch.device('cuda')

    backbone = build_backbone()
    transformer = Transformer(d_model=256,
                              dropout=0.1,
                              nhead=8,
                              dim_feedforward=2048,
                              num_encoder_layers=6,
                              num_decoder_layers=6,
                              normalize_before=False,
                              return_intermediate_dec=True)
    model = DETR(backbone,
                 transformer,
                 num_classes=91,
                 num_queries=100,
                 aux_loss=True)

    state = torch.load('./detr-r50-e632da11.pth')
    model.load_state_dict(state['model'])
    model.to(device)
    model.eval()

    gen_wts(model, "detr")
def instantiate_model(model_name, vocab_size, embeddings):
    """Instantiate a text-classification model by name.

    Args:
        model_name: one of "rcnn", "textcnn", "textrnn", "attention_rnn",
            "transformer"; any other value falls back to FastText
            (matching the original elif chain's behavior).
        vocab_size: vocabulary size passed to the model constructor.
        embeddings: pretrained embedding matrix passed to the constructor.

    Returns:
        The constructed model instance.
    """
    def _load_cfg(path):
        # Fix: the original used yaml.load(open(path)) and leaked the
        # file handle; close it with a context manager.
        with open(path) as f:
            return yaml.load(f, Loader=yaml.FullLoader)

    multi_layer_args = _load_cfg('./configs/multi_layer.yml')

    # Dispatch table replaces the repetitive elif chain.
    registry = {
        "rcnn": (RCNN, './configs/rcnn.yml'),
        "textcnn": (TextCNN, './configs/textcnn.yml'),
        "textrnn": (TextRNN, './configs/textrnn.yml'),
        "attention_rnn": (AttentionRNN, './configs/attention_rnn.yml'),
        "transformer": (Transformer, './configs/transformer.yml'),
    }
    model_cls, cfg_path = registry.get(model_name,
                                       (FastText, './configs/fasttext.yml'))
    model_args = _load_cfg(cfg_path)
    model = model_cls(vocab_size, embeddings,
                      **{**model_args, **multi_layer_args})

    logger = get_logger(__name__)
    logger.info("A model of {} is instantiated.".format(model.__class__.__name__))
    return model
def __init__(self):
    """Graph-matching network: DGCNN point features feeding a stack of
    Siamese GConv layers, each with an affinity head, optional attention
    transformers, and a single cross-graph layer near the top."""
    super(Net, self).__init__()
    self.pointfeaturer = DGCNN(cfg.PGM.FEATURES, cfg.PGM.NEIGHBORSNUM,
                               cfg.PGM.FEATURE_EDGE_CHANNEL)
    self.gnn_layer = cfg.PGM.GNN_LAYER
    for i in range(self.gnn_layer):
        # First layer consumes concatenated node+edge features; later
        # layers stay at GNN_FEAT width.
        if i == 0:
            gnn_layer = Siamese_Gconv(
                cfg.PGM.FEATURE_NODE_CHANNEL + cfg.PGM.FEATURE_EDGE_CHANNEL,
                cfg.PGM.GNN_FEAT)
        else:
            gnn_layer = Siamese_Gconv(cfg.PGM.GNN_FEAT, cfg.PGM.GNN_FEAT)
        self.add_module('gnn_layer_{}'.format(i), gnn_layer)
        self.add_module('affinity_{}'.format(i), Affinity(cfg.PGM.GNN_FEAT))
        if cfg.PGM.USEATTEND == 'attentiontransformer':
            self.add_module(
                'gmattend{}'.format(i),
                Transformer(2 * cfg.PGM.FEATURE_EDGE_CHANNEL
                            if i == 0 else cfg.PGM.GNN_FEAT))
        self.add_module('InstNorm_layer_{}'.format(i),
                        nn.InstanceNorm2d(1, affine=True))
        if i == self.gnn_layer - 2:  # only second last layer will have cross-graph module
            self.add_module(
                'cross_graph_{}'.format(i),
                nn.Linear(cfg.PGM.GNN_FEAT * 2, cfg.PGM.GNN_FEAT))
def build_model(configs, dataset):
    """Construct the language model selected by ``configs.model``.

    Args:
        configs: configuration object; ``configs.model`` selects the
            architecture ('transformer' or 'nplm').
        dataset: dataset object forwarded to the model constructor.

    Returns:
        The constructed model.

    Raises:
        ValueError: for an unknown model name. (The original fell through
        and raised UnboundLocalError on ``return model``.)
    """
    if configs.model == 'transformer':
        return Transformer(configs, dataset)
    if configs.model == 'nplm':
        return NPLM(configs, dataset)
    raise ValueError(f"Unknown model: {configs.model!r}")
def __init__(self,
             *,
             image_size,
             patch_size,
             num_classes,
             dim,
             depth,
             heads,
             mlp_dim,
             pool='cls',
             channels=3,
             dim_head=64,
             dropout=0.,
             emb_dropout=0.):
    """Vision Transformer (ViT) classifier.

    Args:
        image_size: input image size.
        patch_size: patch size.
        num_classes: number of target classes.
        dim: output dimension of the Transformer encoder.
        depth: number of encoder layers.
        dropout: dropout rate in the FFN.
        emb_dropout: dropout rate on the patch projection.
        pool: cls token pooling or mean pooling.
    """
    super().__init__()
    image_height, image_width = pair(image_size)
    patch_height, patch_width = pair(patch_size)

    assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

    num_patches = (image_height // patch_height) * (image_width // patch_width)
    patch_dim = channels * patch_height * patch_width
    assert pool in {
        'cls', 'mean'
    }, 'pool type must be either cls (cls token) or mean (mean pooling)'

    # Flatten each (p1 x p2 x c) patch and linearly project it to `dim`.
    self.to_patch_embedding = nn.Sequential(
        Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
                  p1=patch_height,
                  p2=patch_width),
        nn.Linear(patch_dim, dim),
    )

    # Learned position embeddings: one per patch plus one for the cls token.
    self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
    self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
    self.dropout = nn.Dropout(emb_dropout)

    self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim,
                                   dropout)

    self.pool = pool
    self.to_latent = nn.Identity()

    self.mlp_head = nn.Sequential(nn.LayerNorm(dim),
                                  nn.Linear(dim, num_classes))
def create_model(weights):
    """Build DETR (COCO config) and load the checkpoint at `weights`."""
    model = DETR(build_backbone(),
                 Transformer(d_model=256, return_intermediate_dec=True),
                 num_classes=91,
                 num_queries=100)
    state_dict = torch.load(weights, map_location='cpu')['model']
    model.load_state_dict(state_dict)
    return model
def _make_detr(backbone_name: str, dilation=False, num_classes=91, mask=False):
    """Build a DETR model around the named torchvision backbone; wrap in a
    segmentation head when `mask` is set."""
    d_model = 256
    body = Backbone(backbone_name,
                    train_backbone=True,
                    return_interm_layers=mask,
                    dilation=dilation)
    joined = Joiner(body, PositionEmbeddingSine(d_model // 2, normalize=True))
    joined.num_channels = body.num_channels
    detr = DETR(joined,
                Transformer(d_model=d_model, return_intermediate_dec=True),
                num_classes=num_classes,
                num_queries=100)
    return DETRsegm(detr) if mask else detr
def build_transformer(hidden_dim, dropout, nheads, dim_feedforward,
                      enc_layers, dec_layers, pre_norm):
    """Thin factory mapping flat hyperparameters onto the Transformer's
    keyword arguments (always returns intermediate decoder outputs)."""
    kwargs = dict(d_model=hidden_dim,
                  dropout=dropout,
                  nhead=nheads,
                  dim_feedforward=dim_feedforward,
                  num_encoder_layers=enc_layers,
                  num_decoder_layers=dec_layers,
                  normalize_before=pre_norm,
                  return_intermediate_dec=True)
    return Transformer(**kwargs)
def test_transformer(self):
    """Smoke-test a single training step of the Transformer on dummy data."""
    # this should be small GPT-2, but the param count is wrong
    # (real ff_dim is 768*4)
    model = Transformer(syms=10,
                        maxlen=6,
                        layers=12,
                        embed_dim=768,
                        num_heads=12,
                        ff_dim=768 // 4)
    # Dummy batch: float inputs, integer target ids, sequence length 6.
    X = np.zeros((BS, 6), dtype=np.float32)
    Y = np.zeros((BS, 6), dtype=np.int32)
    train_one_step(model, X, Y)
def __make_transformer_top(self, x, verbose=False):
    """Build the transformer head: 1x1-project the backbone feature map,
    add learned 2-D positional embeddings, and run the transformer with a
    learned query embedding. Returns the transformer's attention output."""
    # Project backbone channels down to the transformer width.
    h = Conv2D(self.hidden_dim, kernel_size=1, strides=1,
               padding='same', kernel_initializer='he_normal',
               use_bias=True, data_format='channels_last')(x)
    if verbose:
        print('h', h.shape)
    # TF1 wraps static dims in Dimension objects, hence the .value branch.
    if tf.__version__ < "2.0.0":
        H, W = h.shape[1].value, h.shape[2].value
    else:
        H, W = h.shape[1], h.shape[2]
    if verbose:
        print('H,W', H, W)
    query_pos = self.get_trainable_parameter(shape=(self.n_query_pos,
                                                    self.hidden_dim))
    # Separate row/column embeddings, each half the hidden width; they are
    # concatenated below to form a full-width 2-D position embedding.
    row_embed = self.get_trainable_parameter(shape=(100,
                                                    self.hidden_dim // 2))
    col_embed = self.get_trainable_parameter(shape=(100,
                                                    self.hidden_dim // 2))
    # Broadcast column embeddings over rows and row embeddings over columns.
    cat1_col = tf.expand_dims(col_embed[:W], 0)
    cat1_col = tf.repeat(cat1_col, H, axis=0)
    if verbose:
        print('col_embed', cat1_col.shape)
    cat2_row = tf.expand_dims(row_embed[:H], 1)
    cat2_row = tf.repeat(cat2_row, W, axis=1)
    if verbose:
        print('row_embed', cat2_row.shape)
    pos = tf.concat([cat1_col, cat2_row], axis=-1)
    # Flatten the (H, W) grid into a sequence of H*W positions.
    if tf.__version__ < "2.0.0":
        pos = tf.expand_dims(
            tf.reshape(pos, [pos.shape[0].value * pos.shape[1].value, -1]),
            0)
    else:
        pos = tf.expand_dims(
            tf.reshape(pos, [pos.shape[0] * pos.shape[1], -1]), 0)
    h = tf.reshape(h, [-1, h.shape[1] * h.shape[2], h.shape[3]])
    temp_input = pos + h
    h_tag = tf.transpose(h, perm=[0, 2, 1])
    if verbose:
        print('h_tag transpose1', h_tag.shape)
    # Map the H*W positions down to n_query_pos channels.
    h_tag = Conv1D(query_pos.shape[0], kernel_size=1, strides=1,
                   padding='same', kernel_initializer='he_normal',
                   use_bias=True, data_format='channels_last')(h_tag)
    if verbose:
        print('h_tag conv', h_tag.shape)
    h_tag = tf.transpose(h_tag, perm=[0, 2, 1])
    if verbose:
        print('h_tag transpose2', h_tag.shape)
    query_pos = tf.expand_dims(query_pos, 0)
    if verbose:
        print('query_pos', query_pos.shape)
    # NOTE(review): adding and then immediately subtracting h_tag leaves
    # query_pos (numerically almost) unchanged — this looks like dead code
    # or a leftover experiment; confirm the intended query conditioning.
    query_pos += h_tag
    query_pos -= h_tag
    self.transformer = Transformer(
        d_model=self.hidden_dim,
        nhead=self.nheads,
        num_encoder_layers=self.num_encoder_layers,
        num_decoder_layers=self.num_decoder_layers)
    atten_out, attention_weights = self.transformer(temp_input, query_pos)
    return atten_out
def __init__(self,
             vocabulary_size=hp.VOCABULARY_SIZE,
             embedding_size=hp.EMBEDDING_SIZE,
             number_of_properties=hp.NUMBER_OF_PROPERTIES,
             padding_index=hp.PADDING_INDEX,
             model_dimension=hp.MODEL_DIMENSION,
             target_sequence_length=hp.TARGET_SEQUENCE_LENGTH,
             dropout_probability=hp.DROPOUT_PROBABILITY,
             feed_forward_transformer_layer_dimension=hp.FEED_FORWARD_TRANSFORMER_LAYER_DIMENSION,
             learning_rate=hp.LEARNING_RATE):
    """Build the Transformer, set up padding-aware cross-entropy, and
    initialize weights (Xavier-uniform for matrices, N(0, 0.1) for
    vectors)."""
    super().__init__()
    self.transformer = Transformer(vocabulary_size, embedding_size,
                                   number_of_properties, padding_index,
                                   model_dimension, target_sequence_length,
                                   dropout_probability,
                                   feed_forward_transformer_layer_dimension)
    self.loss_function = nn.CrossEntropyLoss(ignore_index=padding_index)
    self.learning_rate = learning_rate
    gain = nn.init.calculate_gain('relu')
    for weight in self.transformer.parameters():
        if weight.dim() > 1:
            nn.init.xavier_uniform_(weight, gain=gain)
        else:
            nn.init.normal_(weight, std=0.1)
    self.train_name = "train"
    self.validation_name = "validation"
def construct_model(model_type: str, weight_matrix: np.ndarray) -> torch.nn.Module:
    """Build the model selected by `model_type` ('mlp', 'transformer', or
    'lstm') on top of the given embedding matrix; any other value yields
    None."""
    kwargs = dict(num_embeddings=weight_matrix.shape[0],
                  embedding_matrix=weight_matrix)
    if model_type == 'mlp':
        return MLP(**kwargs)
    if model_type == 'transformer':
        return Transformer(**kwargs)
    if model_type == 'lstm':
        return ResBiLSTM(**kwargs)
    return None
class TransformerLightning(pl.LightningModule):
    """Lightning wrapper around a property-conditioned Transformer trained
    with token-level cross-entropy (padding ignored) and Adam."""

    def __init__(self,
                 vocabulary_size=hp.VOCABULARY_SIZE,
                 embedding_size=hp.EMBEDDING_SIZE,
                 number_of_properties=hp.NUMBER_OF_PROPERTIES,
                 padding_index=hp.PADDING_INDEX,
                 model_dimension=hp.MODEL_DIMENSION,
                 target_sequence_length=hp.TARGET_SEQUENCE_LENGTH,
                 dropout_probability=hp.DROPOUT_PROBABILITY,
                 feed_forward_transformer_layer_dimension=hp.FEED_FORWARD_TRANSFORMER_LAYER_DIMENSION,
                 learning_rate=hp.LEARNING_RATE):
        """Build the Transformer and initialize its weights
        (Xavier-uniform for matrices, N(0, 0.1) for vectors)."""
        super().__init__()
        self.transformer = Transformer(vocabulary_size, embedding_size,
                                       number_of_properties, padding_index,
                                       model_dimension,
                                       target_sequence_length,
                                       dropout_probability,
                                       feed_forward_transformer_layer_dimension)
        self.loss_function = nn.CrossEntropyLoss(ignore_index=padding_index)
        self.learning_rate = learning_rate
        relu_recommended_gain = nn.init.calculate_gain('relu')
        for parameter in self.transformer.parameters():
            if parameter.dim() > 1:
                nn.init.xavier_uniform_(parameter,
                                        gain=relu_recommended_gain)
            else:
                nn.init.normal_(parameter, std=0.1)
        self.train_name = "train"
        self.validation_name = "validation"

    def forward(self, batched_source_numericalized, batched_source_properties,
                batched_source_padding_mask):
        # NOTE(review): inference forward is an unimplemented stub.
        pass

    def training_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, self.train_name)

    def validation_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, self.validation_name)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def _step(self, batch, batch_idx, mode):
        """Shared train/validation step: forward pass, loss, and logging."""
        bsn, bsp, btin, bspm, btipm, bton = batch
        logits = self.transformer(bsn, bsp, btin, bspm, btipm)
        # logits -> batch_size x target_sequence_length x vocabulary_size
        # bton -> batch_size x target_sequence_length([0, vocabulary_size-1])
        # CrossEntropyLoss expects the class axis second, hence the rearrange.
        loss = self.loss_function(rearrange(logits, 'b t v -> b v t'), bton)
        # Per-step logging only during training; epoch aggregation always.
        on_step = True if mode == self.train_name else False
        self.log(f'{mode}_loss', loss, on_step=on_step, on_epoch=True,
                 prog_bar=True, logger=True)
        return loss
def __init__(self, img_model, seq_model):
    """Compose a frame/image encoder with a sequence model, plus an
    attention head and a two-layer scoring MLP.

    Args:
        img_model: 'slow_fusion', 'early_fusion', 'late_fusion',
            'vanilla_cnn', or any other string (treated as an ImageNet
            model name and wrapped by ImageNet_Model_Wrapper).
        seq_model: 'vanilla_rnn', 'lstm', 'lstmn', 'transformer_abs', or
            'stack_lstm'; any other value leaves self.seq_model as None.
    """
    super().__init__()
    self.img_model, self.seq_model = None, None
    # Imports are done lazily so only the selected model's module loads.
    if img_model == "slow_fusion":
        from models.slow_fusion import SlowFusion
        self.img_model = SlowFusion(3, 10, 64)
    elif img_model == "early_fusion":
        from models.early_fusion import EarlyFusion
        self.img_model = EarlyFusion(3, 10, 64)
    elif img_model == "late_fusion":
        from models.late_fusion import LateFusion
        self.img_model = LateFusion(3, 10, 64)
    elif img_model == "vanilla_cnn":
        from models.basic_cnn import BasicCNN
        self.img_model = BasicCNN(3, 64)
    else:
        from models.imagenet_model_wrapper import ImageNet_Model_Wrapper
        self.img_model = ImageNet_Model_Wrapper(img_model)
    if seq_model == "vanilla_rnn":
        from models.rnn import RNN
        self.seq_model = RNN(512, 256, 2)
    elif seq_model == "lstm":
        from models.lstm import LSTM
        self.seq_model = LSTM(512, 256, num_layers=2, dropout=0.1,
                              bidirectional=True)
    elif seq_model == "lstmn":
        from models.lstmn import BiLSTMN
        self.seq_model = BiLSTMN(512, 256, num_layers=2, dropout=0.1,
                                 tape_depth=10)
    elif seq_model == "transformer_abs":
        from models.transformer import Transformer
        self.seq_model = Transformer(512, 8)
    elif seq_model == "stack_lstm":
        from models.stack_lstm import EncoderLSTMStack
        self.seq_model = EncoderLSTMStack(512, 256)
    # attention over seq_model output
    self.query_vector = nn.Parameter(torch.randn(1, 64))
    # self.attn_w = nn.Bilinear(64, 512, 1)
    self.attn_w = nn.Parameter(torch.randn(64, 512))
    self.linear1 = nn.Linear(512, 32)
    self.linear2 = nn.Linear(32, 1)
def get_models(
        vocab_size,   # vocabulary size
        n_class=10,   # number of classes
        seq_len=38,   # sentence length
        device=None): # torch device
    """Build every model to be trained and return them as a list
    (FastText, TextCNN, TextRNN, TextRCNN, Transformer)."""
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return [
        FastText(vocab_size=vocab_size, n_class=n_class),
        TextCNN(vocab_size=vocab_size, n_class=n_class),
        TextRNN(vocab_size=vocab_size, n_class=n_class),
        TextRCNN(vocab_size=vocab_size, n_class=n_class),
        Transformer(vocab_size=vocab_size,
                    seq_len=seq_len,
                    n_class=n_class,
                    device=device),
    ]
def __init__(self,
             body,
             num_classes=90,
             num_queries=100,
             aux_loss=True,
             num_channels=512,
             hidden_dim=64,
             dropout=.1,
             nheads=8,
             dim_feedforward=256,
             enc_layers=2,
             dec_layers=2,
             pre_norm=False,
             return_intermediate_dec=True,
             position_embedding=None):
    """Build a compact DETR: backbone + positional embedding joined, then
    a small transformer, handed to the parent constructor.

    Bug fix: `return_intermediate_dec` was accepted but ignored — the
    Transformer was always built with True. It is now forwarded; the
    default (True) preserves the previous behavior.

    Args:
        body: backbone body spec forwarded to Backbone.
        num_classes/num_queries/aux_loss: DETR head configuration.
        num_channels: channel count reported by the joined backbone.
        hidden_dim: transformer model width (d_model).
        dropout/nheads/dim_feedforward/enc_layers/dec_layers/pre_norm:
            transformer hyperparameters.
        return_intermediate_dec: return intermediate decoder layers.
        position_embedding: optional embedding module; defaults to
            PositionEmbeddingSine over half the hidden width.
    """
    backbone = Backbone(body=body)
    N_steps = hidden_dim // 2
    position_embedding = position_embedding if position_embedding is not None else PositionEmbeddingSine(
        N_steps, normalize=True)
    model = Joiner(backbone, position_embedding)
    model.num_channels = num_channels
    transformer = Transformer(
        d_model=hidden_dim,
        dropout=dropout,
        nhead=nheads,
        dim_feedforward=dim_feedforward,
        num_encoder_layers=enc_layers,
        num_decoder_layers=dec_layers,
        normalize_before=pre_norm,
        return_intermediate_dec=return_intermediate_dec,
    )
    super().__init__(model,
                     transformer,
                     num_classes=num_classes,
                     num_queries=num_queries,
                     aux_loss=aux_loss)