def train():
    """Benchmark the forward and backward pass of a standalone ``Embedding``.

    Random token ids and per-row position ids are created and moved to the
    current CUDA device. The embedding is then optimised with SGD for a fixed
    number of epochs, using the mean of its output as a stand-in loss, and each
    phase is timed with ``Profiler`` under keys that encode the world size and
    the vocabulary size.

    Batch size, sequence length, hidden size, vocabulary size, dropout and the
    experiment name are read from ``args``. This function does not bind
    ``args`` itself, so it must be available from an enclosing scope.
    """
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')

    # Use fake train data
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    input_indices = torch.randint(low=0, high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(
        torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        """Initialise ``tensor`` in place from N(0, 1)."""
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)
    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    nproc = torch.distributed.get_world_size()

    # The profiler keys do not change between epochs; build each one once so
    # the start/stop pairs cannot drift apart.
    overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
    forward_name = f'emb_forward_np-{nproc}_vs-{vocab_size}'
    backward_name = f'emb_backward_np-{nproc}_vs-{vocab_size}'

    for epoch in range(num_epochs):
        profiler.start(overall_name)

        # Forward pass
        profiler.start(forward_name)
        # Call the module itself rather than ``.forward`` so that any
        # registered forward hooks are executed.
        embedding_output = embedding(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
        # Wait for queued CUDA work so the timer covers the real compute.
        torch.cuda.synchronize()
        profiler.stop(forward_name)

        # Backward pass
        profiler.start(backward_name)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(backward_name)

        profiler.stop(overall_name)
def __init__(self, dataset, config):
    """Create the token and character embedding layers and the hidden dropout.

    Args:
        dataset: Supplies ``token_map``, ``char_map``, ``VOCAB_PADDING`` and
            ``model_mode``.
        config: Supplies the ``embedding``, ``feature`` and ``train``
            sections read below; also stored on ``self.config``.
    """
    super(Classifier, self).__init__()
    self.config = config

    emb_cfg = self.config.embedding
    # Settings that are identical for the token and the character table.
    shared_kwargs = dict(
        mode=EmbeddingProcessType.FLAT,
        dropout=emb_cfg.dropout,
        init_type=emb_cfg.initializer,
        low=-emb_cfg.uniform_bound,
        high=emb_cfg.uniform_bound,
        std=emb_cfg.random_stddev,
        fan_mode=emb_cfg.fan_mode,
        activation_type=ActivationType.NONE,
        model_mode=dataset.model_mode,
    )

    # Only the token table is given a pretrained embedding file.
    self.token_embedding = Embedding(
        dataset.token_map,
        config.embedding.dimension,
        cDataset.DOC_TOKEN,
        config,
        dataset.VOCAB_PADDING,
        pretrained_embedding_file=config.feature.token_pretrained_file,
        **shared_kwargs)

    self.char_embedding = Embedding(
        dataset.char_map,
        config.embedding.dimension,
        cDataset.DOC_CHAR,
        config,
        dataset.VOCAB_PADDING,
        **shared_kwargs)

    self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)
def train():
    """Run one forward pass of ``Embedding`` + ``ParallelTransformer``.

    Random token ids (fixed sequence length 1024, vocabulary 4096) are
    embedded and fed through the transformer together with the attention mask
    from ``get_ltor_masks_and_position_ids``; the output is printed on rank 0.
    Batch size and model dimensions come from ``get_args()``.
    """
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')

    # Use fake train data
    args = get_args()
    sequence_length = 1024
    vocab_size = 4096
    dropout_prob = 0.1
    batch_shape = (args.batch_size, sequence_length)

    token_ids = torch.randint(low=0, high=vocab_size, size=batch_shape)
    input_indices = token_ids.to(torch.cuda.current_device())

    # One row of 0..sequence_length-1 per batch entry.
    positions = torch.arange(start=0, end=sequence_length)
    position_indices = torch.tile(positions, (args.batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def _normal_init(tensor):
        """Initialise ``tensor`` in place from N(0, 1)."""
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(
        hidden_size=args.hidden_size,
        vocab_size=vocab_size,
        max_sequence_length=sequence_length,
        embedding_dropout_prob=dropout_prob,
        init_method=_normal_init,
    )
    embedding_output = embedding.forward(input_indices, position_indices)

    def _mask_scores(attention_scores, ltor_mask):
        """Overwrite masked positions of the scores with -10000.0 in place."""
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    transformer = ParallelTransformer(
        attention_mask_func=_mask_scores,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1,
    )

    # Only the attention mask is consumed; the other two results are unused.
    masks = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
    attention_mask, _loss_mask, _position_ids = masks

    transformer_output = transformer.forward(
        hidden_states=embedding_output, attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')
def train():
    """Run one forward pass of ``Embedding`` + ``ParallelSelfAttention``.

    Random token ids (batch 32, sequence length 1024, vocabulary 4096) are
    embedded and passed through a single self-attention layer together with
    the attention mask from ``get_ltor_masks_and_position_ids``. Tensor sizes
    are printed along the way and the final output is printed on rank 0.
    """
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training self attention layer...')

    # Use fake train data
    args = get_args()
    batch_size = 32
    sequence_length = 1024
    hidden_size = args.hidden_size
    vocab_size = 4096
    dropout_prob = 0.1

    token_ids = torch.randint(low=0, high=vocab_size,
                              size=(batch_size, sequence_length))
    input_indices = token_ids.to(torch.cuda.current_device())

    # One row of 0..sequence_length-1 per batch entry.
    positions = torch.arange(start=0, end=sequence_length)
    position_indices = torch.tile(positions, (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def _normal_init(tensor):
        """Initialise ``tensor`` in place from N(0, 1)."""
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        max_sequence_length=sequence_length,
        embedding_dropout_prob=dropout_prob,
        init_method=_normal_init,
    )
    embedding_output = embedding.forward(input_indices, position_indices)

    def _mask_scores(attention_scores, ltor_mask):
        """Print both sizes, then fill masked scores with -10000.0 in place."""
        print(f'ALBERT_DEBUG: attention_scores.size() = {attention_scores.size()}')
        print(f'ALBERT_DEBUG: ltor_mask.size() = {ltor_mask.size()}')
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    self_attention = ParallelSelfAttention(
        attention_mask_func=_mask_scores,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
    )

    # Only the attention mask is consumed; the other two results are unused.
    masks = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
    attention_mask, _loss_mask, _position_ids = masks

    print(f'ALBERT_DEBUG: embedding_output.size() = {embedding_output.size()}')
    self_att_output = self_attention.forward(
        hidden_states=embedding_output, attention_mask=attention_mask)
    print_rank_0(f'AutoMP: self_att_output = {self_att_output}')
def __init__(self, dataset, config):
    """Create one summed embedding per enabled feature plus the output layer.

    The token embedding is always built. The n-gram embedding is built when
    ``config.feature.token_ngram > 1``, and the keyword / topic embeddings
    when their name appears in ``config.feature.feature_names``. A linear
    layer maps ``config.embedding.dimension`` to ``len(dataset.label_map)``.

    Args:
        dataset: Supplies the vocabulary maps, ``VOCAB_PADDING`` and
            ``label_map``.
        config: Supplies the ``embedding``, ``feature`` and ``train``
            sections read below; also stored on ``self.config``.
    """
    super(FastText, self).__init__()
    self.config = config
    assert "token" in self.config.feature.feature_names

    emb_cfg = config.embedding
    # Settings that every feature embedding has in common.
    shared_kwargs = dict(
        padding_idx=dataset.VOCAB_PADDING,
        mode=EmbeddingProcessType.SUM,
        dropout=0,
        init_type=emb_cfg.initializer,
        low=-emb_cfg.uniform_bound,
        high=emb_cfg.uniform_bound,
        std=emb_cfg.random_stddev,
        activation_type=ActivationType.NONE,
    )

    self.token_embedding = Embedding(
        dataset.token_map, emb_cfg.dimension, cDataset.DOC_TOKEN, config,
        pretrained_embedding_file=config.feature.token_pretrained_file,
        **shared_kwargs)

    if self.config.feature.token_ngram > 1:
        self.token_ngram_embedding = Embedding(
            dataset.token_ngram_map, emb_cfg.dimension,
            cDataset.DOC_TOKEN_NGRAM, config,
            **shared_kwargs)

    if "keyword" in self.config.feature.feature_names:
        self.keyword_embedding = Embedding(
            dataset.keyword_map, emb_cfg.dimension,
            cDataset.DOC_KEYWORD, config,
            pretrained_embedding_file=config.feature.keyword_pretrained_file,
            **shared_kwargs)

    if "topic" in self.config.feature.feature_names:
        self.topic_embedding = Embedding(
            dataset.topic_map, emb_cfg.dimension,
            cDataset.DOC_TOPIC, config,
            **shared_kwargs)

    num_labels = len(dataset.label_map)
    self.linear = torch.nn.Linear(config.embedding.dimension, num_labels)
    self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)
def __init__(self, embedding_info: Dict, encoder_info: Dict, decoder_info: Dict, hidden_states: Dict, token_to_id: Dict, type_to_id: Dict, label_to_id: Dict):
    """Store the configuration and build the embedding, encoder and decoder.

    Args:
        embedding_info: Extra keyword arguments forwarded to ``Embedding``.
        encoder_info: Extra keyword arguments forwarded to ``Encoder``.
        decoder_info: Extra keyword arguments forwarded to ``Decoder``.
        hidden_states: Mapping read at the keys ``'embedding'``,
            ``'encoder'`` and ``'decoder'``.
        token_to_id: Passed to ``Embedding``.
        type_to_id: Passed to ``Embedding``.
        label_to_id: Passed to ``Decoder``.
    """
    super().__init__()

    # Keep every constructor argument on the instance.
    self.embedding_info = embedding_info
    self.encoder_info = encoder_info
    self.decoder_info = decoder_info
    self.hidden_states = hidden_states
    self.token_to_id = token_to_id
    self.type_to_id = type_to_id
    self.label_to_id = label_to_id

    sizes = self.hidden_states

    # Each stage receives the size of the stage before it and its own size.
    self.embedding = Embedding(
        h_emb=sizes['embedding'],
        token_to_id=self.token_to_id,
        type_to_id=self.type_to_id,
        **self.embedding_info)
    self.encoder = Encoder(
        h_emb=sizes['embedding'],
        h_enc=sizes['encoder'],
        **self.encoder_info)
    self.decoder = Decoder(
        h_enc=sizes['encoder'],
        h_dec=sizes['decoder'],
        label_to_id=self.label_to_id,
        **self.decoder_info)
def __init__(self, hidden_size, vocab_size, sequence_length, hidden_dropout, attention_mask_func, num_layers, layernorm_epsilon, num_attention_heads, attention_dropout, init_method):
    """Build the embedding layer and the transformer stack.

    The hyper-parameters are only forwarded to the two sub-modules; none of
    them is stored on the instance. ``_embedding_key`` and
    ``_transformer_key`` hold the string names ``'embedding'`` and
    ``'transformer'``.
    """
    super(TransformerLanguageModel, self).__init__()

    # Embeddings
    embedding_args = (
        hidden_size,
        vocab_size,
        sequence_length,
        hidden_dropout,
        init_method,
    )
    self.embedding = Embedding(*embedding_args)
    self._embedding_key = 'embedding'

    # Transformer
    transformer_args = (
        attention_mask_func,
        num_layers,
        hidden_size,
        layernorm_epsilon,
        num_attention_heads,
        attention_dropout,
        hidden_dropout,
    )
    self.transformer = ParallelTransformer(*transformer_args)
    self._transformer_key = 'transformer'
def get_embedding(self, embed_id):
    """Return the word embedding associated with ``embed_id``.

    The embedding is loaded from ``<meta.dir_base>/<meta["file"]>`` the first
    time it is requested and is served from ``self.embedding_cache`` on later
    calls.

    Args:
        embed_id: Key into ``self.embedding_meta``.

    Returns:
        The ``Embedding`` instance, or ``None`` when ``embed_id`` is unknown
        or loading fails. A failed load is logged and is not cached, so the
        next call tries again.
    """
    if embed_id not in self.embedding_meta:
        return None
    if embed_id in self.embedding_cache:
        # Pass the value as a logging argument instead of pre-formatting with
        # ``%``: eager formatting raises TypeError when the value is a tuple.
        log.info("Using cached embedding for %s", embed_id)
        return self.embedding_cache[embed_id]
    # load the associated word embedding
    em = self.embedding_meta[embed_id]
    in_path = em.dir_base / em["file"]
    log.info("Loading word embedding from %s", in_path)
    try:
        embedding = Embedding(in_path)
    except Exception as e:
        # Best effort by design: report the failure and let the caller
        # handle the missing embedding.
        log.warning("Failed to load word embedding: %s", in_path)
        log.warning(e)
        return None
    self.embedding_cache[embed_id] = embedding
    return embedding
def create_model(self, fetch_data=None):
    """Build the graph for one model replica and return its loss.

    Inside the ``'variables'`` scope the method assembles the per-step input
    tensors (taken from ``fetch_data`` when ``fetch_mode`` is
    ``'tf_example_dataset'``, otherwise ``tf.zeros`` tensors), the optional
    style ``score`` tensor, the embedding tables / output projection and,
    when ``'rule'`` is in ``model_config.memory``, the non-trainable memory
    variables. Inside the ``'model'`` scope it calls ``self.model_fn`` and
    derives the loss.

    Args:
        fetch_data: Optional mapping of input tensors. The keys read here are
            ``line_comp_ids``, ``line_simp_ids``, ``line_comp_segids``,
            ``line_simp_segids``, ``ppdb_score``, ``dsim_score``,
            ``add_score``, ``len_score``, ``rule_id`` and ``rule_target``
            (each only under the matching configuration).

    Returns:
        A ``(loss, obj)`` tuple. ``obj`` is a dict of tensors / tensor lists;
        its keys differ between evaluation (``not self.is_train``) and
        training, see the two ``obj = {...}`` literals below.

    Raises:
        NotImplementedError: If ``tune_style`` is set and either
            ``fetch_data`` is used outside training, or no ``fetch_data`` is
            used during training.

    Side effects:
        Sets ``self.embedding`` and, in training, ``self.loss_style`` and
        ``self.logits``; with ``pretrained_embedding`` in training also
        ``self.embed_*_placeholder`` and ``self.replace_emb_*``.
    """
    with tf.variable_scope('variables'):
        sentence_simple_input_placeholder = []
        sentence_complex_input_placeholder = []
        if self.model_config.subword_vocab_size and self.model_config.seg_mode:
            sentence_simple_segment_input_placeholder = []
            sentence_complex_segment_input_placeholder = []

        # Extra tensors handed to model_fn (segment ids, when enabled).
        obj = {}
        if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset':
            # Inputs come from the dataset: split each id matrix into a list
            # with one tensor per position along axis 1.
            for t in tf.unstack(fetch_data['line_comp_ids'], axis=1):
                sentence_complex_input_placeholder.append(t)
            for t in tf.unstack(fetch_data['line_simp_ids'], axis=1):
                sentence_simple_input_placeholder.append(t)

            if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                for t in tf.unstack(fetch_data['line_comp_segids'], axis=1):
                    sentence_complex_segment_input_placeholder.append(t)
                for t in tf.unstack(fetch_data['line_simp_segids'], axis=1):
                    sentence_simple_segment_input_placeholder.append(t)
                obj['line_comp_segids'] = tf.stack(
                    sentence_complex_segment_input_placeholder, axis=1)
                obj['line_simp_segids'] = tf.stack(
                    sentence_simple_segment_input_placeholder, axis=1)

            score = None
            if self.model_config.tune_style:
                if self.is_train:
                    # In training, the scores are taken from fetch_data
                    scores = []
                    if self.model_config.tune_style[0]:
                        ppdb_score = fetch_data['ppdb_score']
                        scores.append(ppdb_score)
                        print('Tune ppdb score!')
                        if 'plus' in self.model_config.tune_mode:
                            # to avoid most ppdb scores being 0
                            # NOTE(review): this runs after the append above.
                            # If ppdb_score is a tf.Tensor, `+=` rebinds the
                            # local name to a new tensor and the entry in
                            # `scores` stays un-shifted - confirm intent.
                            ppdb_score += 0.1
                    if self.model_config.tune_style[1]:
                        add_score = fetch_data['dsim_score']
                        scores.append(add_score)
                        print('Tune dsim_score score!')
                    if self.model_config.tune_style[2]:
                        add_score = fetch_data['add_score']
                        scores.append(add_score)
                        print('Tune add score!')
                    if self.model_config.tune_style[3]:
                        len_score = fetch_data['len_score']
                        scores.append(len_score)
                        print('Tune length score!')
                else:
                    # In evaluating/predict, scores may be a factor to multiply if in pred mode
                    #   or an actual user-provided score
                    # TODO(sanqiang): not used for now because there is no fetch_data in eval
                    # NOTE(review): this is the non-training branch, yet the
                    # message says "for training" - confirm the wording.
                    raise NotImplementedError('No tune style for training')
                    # ppdb_score = tf.constant(
                    #     self.model_config.tune_style[0], shape=[self.model_config.batch_size], dtype=tf.float32)
                    # add_score = tf.constant(
                    #     self.model_config.tune_style[1], shape=[self.model_config.batch_size], dtype=tf.float32)
                    # len_score = tf.constant(
                    #     self.model_config.tune_style[2], shape=[self.model_config.batch_size], dtype=tf.float32)

                # Assemble scores: split `dimension` evenly between the
                # enabled scores; the last score receives the remainder so
                # the concatenated widths add up to `dimension`.
                # NOTE(review): an empty `scores` list (tune_style truthy but
                # every entry falsy) would divide by zero here.
                dimension_unit = int(self.model_config.dimension / len(scores))
                dimension_runit = self.model_config.dimension - (
                    len(scores) - 1) * dimension_unit
                for s_i, score in enumerate(scores):
                    if s_i < len(scores) - 1:
                        scores[s_i] = tf.expand_dims(tf.tile(
                            tf.expand_dims(scores[s_i], axis=-1),
                            [1, dimension_unit]), axis=1)
                    else:
                        scores[s_i] = tf.expand_dims(tf.tile(
                            tf.expand_dims(scores[s_i], axis=-1),
                            [1, dimension_runit]), axis=1)
                score = tf.concat(scores, axis=-1)
        else:
            # No dataset input: use zero tensors, one per sentence position.
            for step in range(self.model_config.max_simple_sentence):
                sentence_simple_input_placeholder.append(
                    tf.zeros(self.model_config.batch_size, tf.int32, name='simple_input'))

            for step in range(self.model_config.max_complex_sentence):
                sentence_complex_input_placeholder.append(
                    tf.zeros(self.model_config.batch_size, tf.int32, name='complex_input'))

            if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                for step in range(self.model_config.max_simple_sentence):
                    sentence_simple_segment_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size, tf.int32, name='simple_seg_input'))

                for step in range(self.model_config.max_complex_sentence):
                    sentence_complex_segment_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size, tf.int32, name='complex_seg_input'))

                obj['line_comp_segids'] = tf.stack(
                    sentence_complex_segment_input_placeholder, axis=1)
                obj['line_simp_segids'] = tf.stack(
                    sentence_simple_segment_input_placeholder, axis=1)

            score = None
            if self.model_config.tune_style:
                if self.is_train:
                    raise NotImplementedError('No tune style for training')
                    # ppdb_score = tf.constant(
                    #     self.model_config.tune_style, shape=[self.model_config.batch_size], dtype=tf.float32)
                    # ppdb_score = tf.expand_dims(tf.tile(
                    #     tf.expand_dims(ppdb_score, axis=-1),
                    #     [1, self.model_config.dimension]), axis=1)
                else:
                    # Outside training the configured tune_style values
                    # themselves are used as constant per-example scores.
                    scores = []
                    if self.model_config.tune_style:
                        if self.model_config.tune_style[0]:
                            ppdb_score = tf.constant(
                                self.model_config.tune_style[0], shape=[self.model_config.batch_size], dtype=tf.float32)
                            scores.append(ppdb_score)
                            print('tune ppdb score')
                        if self.model_config.tune_style[1]:
                            dsim_score = tf.constant(
                                self.model_config.tune_style[1], shape=[self.model_config.batch_size], dtype=tf.float32)
                            scores.append(dsim_score)
                            print('tune dsim score')
                        if self.model_config.tune_style[2]:
                            add_score = tf.constant(
                                self.model_config.tune_style[2], shape=[self.model_config.batch_size], dtype=tf.float32)
                            scores.append(add_score)
                            print('tune add score')
                        if self.model_config.tune_style[3]:
                            len_score = tf.constant(
                                self.model_config.tune_style[3], shape=[self.model_config.batch_size], dtype=tf.float32)
                            scores.append(len_score)
                            print('tune len score')

                    # Assemble scores (same width-splitting scheme as in the
                    # dataset branch above).
                    dimension_unit = int(self.model_config.dimension / len(scores))
                    dimension_runit = self.model_config.dimension - (
                        len(scores) - 1) * dimension_unit
                    for s_i, score in enumerate(scores):
                        if s_i < len(scores) - 1:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_unit]), axis=1)
                        else:
                            scores[s_i] = tf.expand_dims(tf.tile(
                                tf.expand_dims(scores[s_i], axis=-1),
                                [1, dimension_runit]), axis=1)
                    score = tf.concat(scores, axis=-1)

        # For self.model_config.tune_style:
        comp_features = {}
        comp_add_score = tf.zeros(self.model_config.batch_size, tf.float32, name='comp_add_score_input')
        comp_length = tf.zeros(self.model_config.batch_size, tf.float32, name='comp_length_input')
        comp_features['comp_add_score'] = comp_add_score
        comp_features['comp_length'] = comp_length

        sentence_idxs = tf.zeros(self.model_config.batch_size, tf.int32, name='sent_idx')

        self.embedding = Embedding(self.data.vocab_complex, self.data.vocab_simple, self.model_config)
        # In bert_mode some tables are not created here and are passed to
        # model_fn as None, depending on `tie_embedding`.
        if self.model_config.bert_mode:
            emb_complex = None
        else:
            emb_complex = self.embedding.get_complex_embedding()
        if self.model_config.bert_mode and (
                self.model_config.tie_embedding == 'all' or
                self.model_config.tie_embedding == 'enc_dec'):
            emb_simple = None
        else:
            emb_simple = self.embedding.get_simple_embedding()

        if (self.is_train and self.model_config.pretrained_embedding):
            # Placeholders plus assign ops for overwriting both tables.
            # NOTE(review): emb_complex / emb_simple may be None in bert_mode
            # (see above); `.assign` would then fail - confirm that
            # combination is never configured.
            self.embed_complex_placeholder = tf.placeholder(
                tf.float32, (self.data.vocab_complex.vocab_size(), self.model_config.dimension),
                'complex_emb')
            self.replace_emb_complex = emb_complex.assign(
                self.embed_complex_placeholder)

            self.embed_simple_placeholder = tf.placeholder(
                tf.float32, (self.data.vocab_simple.vocab_size(), self.model_config.dimension),
                'simple_emb')
            self.replace_emb_simple = emb_simple.assign(
                self.embed_simple_placeholder)

        if self.model_config.bert_mode and (
                self.model_config.tie_embedding == 'all' or
                self.model_config.tie_embedding == 'dec_out'):
            w = None
        else:
            w = self.embedding.get_w()
        b = self.embedding.get_b()

        mem_contexts, mem_outputs, mem_counter = None, None, None
        rule_id_input_placeholder, rule_target_input_placeholder = [], []
        if 'rule' in self.model_config.memory:
            with tf.device('/cpu:0'):
                # Multiplier for the last dimension of mem_contexts; it stays
                # 0 for any framework other than the two named below.
                context_size = 0
                if self.model_config.framework == 'transformer':
                    context_size = 1
                elif self.model_config.framework == 'seq2seq':
                    context_size = 2
                mem_contexts = tf.get_variable(
                    'mem_contexts',
                    initializer=tf.constant(
                        0, dtype=tf.float32,
                        shape=(self.data.vocab_rule.get_rule_size(),
                               self.model_config.max_target_rule_sublen,
                               self.model_config.dimension * context_size)),
                    trainable=False, dtype=tf.float32)
                mem_outputs = tf.get_variable(
                    'mem_outputs',
                    initializer=tf.constant(
                        0, dtype=tf.float32,
                        shape=(self.data.vocab_rule.get_rule_size(),
                               self.model_config.max_target_rule_sublen,
                               self.model_config.dimension)),
                    trainable=False, dtype=tf.float32)
                mem_counter = tf.get_variable(
                    'mem_counter',
                    initializer=tf.constant(
                        0, dtype=tf.int32,
                        shape=(self.data.vocab_rule.get_rule_size(), 1)),
                    trainable=False, dtype=tf.int32)

        if 'direct' in self.model_config.memory or 'rule' in self.model_config.memory:
            if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset':
                for t in tf.unstack(fetch_data['rule_id'], axis=1):
                    rule_id_input_placeholder.append(t)
                for t in tf.unstack(fetch_data['rule_target'], axis=1):
                    rule_target_input_placeholder.append(t)
            else:
                for step in range(self.model_config.max_cand_rules):
                    rule_id_input_placeholder.append(
                        tf.zeros(self.model_config.batch_size, tf.int32, name='rule_id_input'))

                # Rule targets are int32 for 'direct' memory and strings for
                # 'rule' memory.
                for step in range(self.model_config.max_cand_rules):
                    if 'direct' in self.model_config.memory:
                        rule_target_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size, tf.int32, name='rule_target_input'))
                    elif 'rule' in self.model_config.memory:
                        rule_target_input_placeholder.append(
                            tf.zeros(self.model_config.batch_size, tf.string, name='rule_target_input'))

    with tf.variable_scope('model'):
        output = self.model_fn(sentence_complex_input_placeholder, emb_complex,
                               sentence_simple_input_placeholder, emb_simple,
                               w, b, rule_id_input_placeholder, rule_target_input_placeholder,
                               mem_contexts, mem_outputs,
                               self.global_step, score, comp_features, obj)

        encoder_embs, final_outputs = None, None
        if self.model_config.replace_unk_by_emb:
            encoder_embs = tf.stack(output.encoder_embed_inputs_list, axis=1)

        # decoder/final outputs may arrive as a list of per-step tensors or
        # as a single object; only lists are stacked.
        if output.decoder_outputs_list is not None:
            if type(output.decoder_outputs_list) == list:
                decoder_outputs_list = output.decoder_outputs_list
                decoder_outputs = tf.stack(decoder_outputs_list, axis=1)
            else:
                decoder_outputs = output.decoder_outputs_list

        if output.final_outputs_list is not None:
            if type(output.final_outputs_list) == list:
                final_outputs_list = output.final_outputs_list
                final_outputs = tf.stack(final_outputs_list, axis=1)
            else:
                final_outputs = output.final_outputs_list

        attn_distr = None
        if self.model_config.replace_unk_by_attn:
            attn_distr = output.attn_distr_list

        if not self.is_train:
            # in beam search, the decoder target list is provided directly
            decoder_target = tf.stack(output.decoder_target_list, axis=1)
            loss = tf.reduce_mean(output.decoder_score)
            obj = {
                'sentence_idxs': sentence_idxs,
                'sentence_simple_input_placeholder': sentence_simple_input_placeholder,
                'sentence_complex_input_placeholder': sentence_complex_input_placeholder,
                'decoder_target_list': decoder_target,
                'final_outputs': final_outputs,
                'encoder_embs': encoder_embs,
                'attn_distr': attn_distr
            }
            if self.model_config.subword_vocab_size and self.model_config.seg_mode:
                obj['sentence_complex_segment_input_placeholder'] = sentence_complex_segment_input_placeholder
                obj['sentence_simple_segment_input_placeholder'] = sentence_simple_segment_input_placeholder
            if 'rule' in self.model_config.memory or 'direct' in self.model_config.memory:
                obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                obj['rule_target_input_placeholder'] = rule_target_input_placeholder
            if self.model_config.tune_style:
                obj['comp_features'] = comp_features
            return loss, obj
        else:
            # Memory Populate
            if 'rule' in self.model_config.memory:
                # Update Memory through python injection
                def update_memory(
                        mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp,
                        decoder_targets, decoder_outputs, contexts,
                        rule_target_input_placeholder, rule_id_input_placeholder,
                        global_step, encoder_outputs):
                    """Write contexts/outputs of matched rule targets into memory.

                    Runs in Python via ``tf.py_func``. For every batch row,
                    each rule whose target string occurs in the decoded
                    target has the contexts and outputs at the matched
                    positions stored (first hit) or averaged with the stored
                    value (later hits), and its counter is incremented.
                    Returns the three (modified) memory arrays.
                    """
                    def _seq_contain(arr, tar):
                        # Return the start index of `tar` inside `arr`, or -1.
                        # NOTE(review): after a mismatch j is reset but
                        # arr[i] is not re-tested against tar[0], so a match
                        # starting at the mismatching element is missed
                        # (arr=[1, 1, 2], tar=[1, 2] yields -1). An empty
                        # `tar` raises IndexError on tar[0].
                        j = 0
                        for i in range(len(arr)):
                            if arr[i] == tar[j]:
                                j += 1
                                if j == len(tar):
                                    return i - len(tar) + 1
                            else:
                                j = 0
                        return -1

                    # if 'stopgrad' in self.model_config.rl_configs and global_step % 2 != 0:
                    #     return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp
                    # if global_step <= self.model_config.memory_prepare_step:
                    #     return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                    batch_size = np.shape(rule_target_input_placeholder)[0]
                    # max_rules is computed but not used below.
                    max_rules = np.shape(rule_target_input_placeholder)[1]
                    decoder_targets_str = [
                        ' '.join(sent) for sent in truncate_sents(
                            decode(
                                decoder_targets,
                                self.data.vocab_simple,
                                self.model_config.subword_vocab_size > 0 or
                                'bert_token' in self.model_config.bert_mode))
                    ]
                    for batch_id in range(batch_size):
                        cur_decoder_targets = decoder_targets[batch_id, :]
                        cur_decoder_targets_str = decoder_targets_str[
                            batch_id]

                        cur_decoder_outputs = decoder_outputs[batch_id, :]
                        cur_contexts = contexts[batch_id, :]

                        cur_rule_target_input_placeholder = rule_target_input_placeholder[
                            batch_id, :]
                        # Decode the byte strings, strip NUL characters and
                        # drop entries that start with the PAD symbol.
                        # NOTE(review): dropping entries shifts this list
                        # relative to the unfiltered rule-id row indexed by
                        # the same `step` below - presumably PAD entries only
                        # occur at the end; verify.
                        cur_rule_target_input_placeholder = [
                            tmp.decode("utf-8").strip('\x00')
                            for tmp in cur_rule_target_input_placeholder
                            if not tmp.decode("utf-8").strip().startswith(
                                constant.SYMBOL_PAD)
                        ]
                        cur_rule_id_input_placeholder = rule_id_input_placeholder[
                            batch_id, :]

                        # Build the valid mapper from rule id => target words ids
                        rule_mapper = {}
                        for step in range(
                                len(cur_rule_target_input_placeholder)):
                            rule_target_str = cur_rule_target_input_placeholder[
                                step]
                            if rule_target_str == constant.SYMBOL_PAD:
                                continue
                            rule_id = cur_rule_id_input_placeholder[step]
                            # NOTE(review): rule_target_str is interpolated
                            # into the pattern without re.escape.
                            if rule_id != 0 and re.search(
                                    r'\b%s\b' % rule_target_str,
                                    cur_decoder_targets_str
                            ):  # decoder_target_str in cur_decoder_targets_str:
                                decoder_target_wids = self.data.vocab_simple.encode(
                                    rule_target_str)
                                dec_s_idx = _seq_contain(
                                    cur_decoder_targets, decoder_target_wids)
                                if dec_s_idx != -1:
                                    print('rule_target_str:%s' %
                                          rule_target_str)
                                    print('cur_decoder_targets_str:%s' %
                                          cur_decoder_targets_str)
                                    print('cur_decoder_targets:%s' %
                                          cur_decoder_targets)
                                    print('decoder_target_wids:%s' %
                                          decoder_target_wids)
                                    rule_mapper[rule_id] = list(
                                        range(
                                            dec_s_idx,
                                            dec_s_idx + len(decoder_target_wids)))

                        for rule_id in rule_mapper:
                            dec_idxs = rule_mapper[rule_id]

                            for idx, dec_idx in enumerate(dec_idxs):
                                if mem_counter_tmp[rule_id, 0] == 0:
                                    # First hit for this rule: store as is.
                                    mem_contexts_tmp[
                                        rule_id, idx, :] = cur_contexts[dec_idx, :]
                                    mem_outputs_tmp[
                                        rule_id, idx, :] = cur_decoder_outputs[
                                            dec_idx, :]
                                else:
                                    # Later hits: average with stored value.
                                    mem_contexts_tmp[rule_id, idx, :] = (
                                        cur_contexts[dec_idx, :] +
                                        mem_contexts_tmp[rule_id, idx, :]) / 2
                                    mem_outputs_tmp[rule_id, idx, :] = (
                                        cur_decoder_outputs[dec_idx, :] +
                                        mem_outputs_tmp[rule_id, idx, :]) / 2

                            mem_counter_tmp[rule_id, 0] += 1

                    return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

                # Only the 'mofinal' setting supplies an output tensor to
                # memorise; otherwise None is passed to py_func.
                mem_output_input = None
                if 'mofinal' in self.model_config.memory_config:
                    mem_output_input = final_outputs
                # elif 'modecode' in self.model_config.memory_config:
                #     mem_output_input = decoder_outputs
                # elif 'moemb' in self.model_config.memory_config:
                #     mem_output_input = tf.stack(
                #         self.embedding_fn(sentence_simple_input_placeholder, emb_simple),
                #         axis=1)

                mem_contexts, mem_outputs, mem_counter = tf.py_func(
                    update_memory, [
                        mem_contexts, mem_outputs, mem_counter,
                        tf.stack(output.decoder_target_list, axis=1),
                        mem_output_input, output.contexts,
                        tf.stack(rule_target_input_placeholder, axis=1),
                        tf.stack(rule_id_input_placeholder, axis=1),
                        self.global_step, output.encoder_outputs
                    ], [tf.float32, tf.float32, tf.int32],
                    stateful=False,
                    name='update_memory')

            # Loss and corresponding prior/mask: weight is 1.0 where the
            # ground-truth token differs from the encoded PAD symbol, else 0.0.
            decode_word_weight_list = [
                tf.to_float(
                    tf.not_equal(
                        d,
                        self.data.vocab_simple.encode(
                            constant.SYMBOL_PAD)))
                for d in output.gt_target_list
            ]
            decode_word_weight = tf.stack(decode_word_weight_list, axis=1)

            gt_target = tf.stack(output.gt_target_list, axis=1)

            def self_critical_loss():
                """Reward-weighted negative sample logits, averaged over non-PAD tokens."""
                # For minimizing the negative log of probabilities
                rewards = tf.py_func(
                    self.metric.self_crititcal_reward,
                    [
                        sentence_idxs,
                        tf.stack(output.sample_target_list, axis=-1),
                        tf.stack(output.decoder_target_list, axis=-1),
                        tf.stack(sentence_simple_input_placeholder, axis=-1),
                        tf.stack(sentence_complex_input_placeholder, axis=-1),
                        tf.ones((1, 1)),
                        # tf.stack(rule_target_input_placeholder, axis=1)
                    ],
                    tf.float32,
                    stateful=False,
                    name='reward')
                # py_func loses static shape information; restore it.
                rewards.set_shape((self.model_config.batch_size,
                                   self.model_config.max_simple_sentence))
                rewards = tf.unstack(rewards, axis=1)

                weighted_probs_list = [
                    rewards[i] * decode_word_weight_list[i] *
                    -output.sample_logit_list[i]
                    for i in range(len(decode_word_weight_list))
                ]
                total_size = tf.reduce_sum(decode_word_weight_list)
                # Guard against division by zero when every token is PAD.
                total_size += 1e-12
                weighted_probs = tf.reduce_sum(
                    weighted_probs_list) / total_size
                loss = weighted_probs
                return loss

            def teacherforce_critical_loss():
                """Per-step reward-weighted loss on a sampled/selected target unit."""
                losses = []
                for step in range(self.model_config.max_simple_sentence):
                    logit = output.decoder_logit_list[step]
                    greedy_target_unit = tf.stop_gradient(
                        tf.argmax(logit, axis=1))
                    if self.model_config.train_mode == 'teachercriticalv2':
                        sampled_target_unit, reward = tf.py_func(
                            self.metric.self_crititcal_reward_unitv2,
                            [
                                sentence_idxs, step, greedy_target_unit,
                                tf.stack(sentence_simple_input_placeholder, axis=-1),
                                tf.stack(
                                    sentence_complex_input_placeholder, axis=-1),
                                self.global_step
                            ],
                            [tf.int32, tf.float32],
                            stateful=False,
                            name='reward')
                        reward.set_shape((self.model_config.batch_size, ))
                        sampled_target_unit.set_shape(
                            (self.model_config.batch_size, ))
                    elif self.model_config.train_mode == 'teachercritical':
                        sampled_target_unit = tf.cast(
                            tf.squeeze(tf.multinomial(logit, 1), axis=1),
                            tf.int32)
                        sampled_target_unit, reward = tf.py_func(
                            self.metric.self_crititcal_reward_unit,
                            [
                                sentence_idxs, step, sampled_target_unit,
                                greedy_target_unit,
                                tf.stack(sentence_simple_input_placeholder, axis=-1),
                                tf.stack(
                                    sentence_complex_input_placeholder, axis=-1),
                                self.global_step,
                            ],
                            [tf.int32, tf.float32],
                            stateful=False,
                            name='reward')
                        reward.set_shape((self.model_config.batch_size, ))
                        sampled_target_unit.set_shape(
                            (self.model_config.batch_size, ))
                    # Pick, for every batch row, the softmax probability of
                    # its selected target unit.
                    indices = tf.stack([
                        tf.range(0, self.model_config.batch_size, dtype=tf.int32),
                        tf.squeeze(sampled_target_unit)
                    ], axis=-1)
                    logit_unit = tf.gather_nd(tf.nn.softmax(logit, axis=1),
                                              indices)
                    decode_word_weight = decode_word_weight_list[step]
                    losses.append(-logit_unit * reward * decode_word_weight)
                loss = tf.add_n(losses)
                return loss

            def teacherforce_loss():
                """Sequence loss of the decoder logits against the ground truth."""
                # loss_fn is selected here but not passed on; the arguments
                # that used it are commented out below.
                if self.model_config.number_samples > 0:
                    loss_fn = tf.nn.sampled_softmax_loss
                else:
                    loss_fn = None
                loss = sequence_loss(
                    logits=tf.stack(output.decoder_logit_list, axis=1),
                    targets=gt_target,
                    weights=decode_word_weight,
                    # softmax_loss_function=loss_fn,
                    # w=w,
                    # b=b,
                    # decoder_outputs=decoder_outputs,
                    # number_samples=self.model_config.number_samples
                )
                return loss

            if self.model_config.train_mode == 'dynamic_self-critical':
                loss = self_critical_loss()
                # loss = tf.cond(
                #     tf.greater(self.global_step, 50000),
                #     # tf.logical_and(tf.greater(self.global_step, 100000), tf.equal(tf.mod(self.global_step, 2), 0)),
                #     lambda : self_critical_loss(),
                #     lambda : teacherforce_loss())
            elif self.model_config.train_mode == 'teachercritical' or self.model_config.train_mode == 'teachercriticalv2':
                # Even global steps use the teacher-forcing loss, odd steps
                # the critical loss.
                loss = tf.cond(tf.equal(tf.mod(self.global_step, 2), 0),
                               lambda: teacherforce_loss(),
                               lambda: teacherforce_critical_loss())

                # loss = teacherforce_critical_loss()
            else:
                loss = teacherforce_loss()

            if self.model_config.architecture == 'ut2t':
                assert 'extra_encoder_loss' in output.obj_tensors and 'extra_decoder_loss' in output.obj_tensors
                loss += output.obj_tensors['extra_encoder_loss']
                loss += output.obj_tensors['extra_decoder_loss']
                print('Use U T2T with ACT')

            self.loss_style = tf.constant(0.0, dtype=tf.float32)
            if output.pred_score_tuple is not None and 'pred' in self.model_config.tune_mode:
                print('Create loss for predicting style')
                ppdb_pred_score, add_pred_score, len_pred_score = output.pred_score_tuple
                # ppdb_pred_score = tf.Print(ppdb_pred_score, [ppdb_pred_score, fetch_data['ppdb_score']],
                #                            message='ppdb_pred_score:', first_n=-1, summarize=100)
                # add_pred_score = tf.Print(add_pred_score, [add_pred_score, fetch_data['add_score']],
                #                           message='add_pred_score:', first_n=-1, summarize=100)
                # len_pred_score = tf.Print(len_pred_score, [len_pred_score, fetch_data['len_score']],
                #                           message='len_pred_score:', first_n=-1, summarize=100)
                # loss = tf.Print(loss, [loss], message='loss before:', summarize=100)
                # NOTE(review): fetch_data is indexed here without a None
                # check - confirm it is always provided on this path.
                self.loss_style += tf.losses.absolute_difference(
                    ppdb_pred_score, fetch_data['ppdb_score'])
                self.loss_style += tf.losses.absolute_difference(
                    add_pred_score, fetch_data['add_score'])
                self.loss_style += tf.losses.absolute_difference(
                    len_pred_score, fetch_data['len_score'])
                loss += self.loss_style
                # loss = tf.Print(loss, [loss], message='loss after:', summarize=100)

            obj = {
                'decoder_target_list': output.decoder_target_list,
                'sentence_idxs': sentence_idxs,
                'sentence_simple_input_placeholder': sentence_simple_input_placeholder,
                'sentence_complex_input_placeholder': sentence_complex_input_placeholder,
            }
            self.logits = output.decoder_logit_list
            if 'rule' in self.model_config.memory:
                obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                obj['rule_target_input_placeholder'] = rule_target_input_placeholder
                # obj['rule_pair_input_placeholder'] = rule_pair_input_placeholder
                obj['mem_contexts'] = mem_contexts
                obj['mem_outputs'] = mem_outputs
                obj['mem_counter'] = mem_counter
            return loss, obj
import numpy as np
import jieba
import os
import sys

# import thulac
# thulac_seg = thulac.thulac(seg_only=True)

# Everything below runs at import time: the network checkpoint and the
# word2vec vectors are loaded from paths relative to this file's directory.
base_path = os.path.dirname(os.path.realpath(__file__))
model = Network(None, os.path.join(base_path, 'log.nosync/network_demo/run1'))
model.load_model(
    os.path.join(base_path,
                 'model_checkpoint/lstm_early_stopping_without_conc'))
path = os.path.join(base_path, 'word2vec_model/wiki.zh.nosync/wiki.zh.vec')
embedding = Embedding()
embedding.load_w2v_model(path, False)

# Input dimensions as reported by the loaded model.
max_steps = model.n_input_steps
n_embedding = model.n_embedding

# Category names (Chinese, plus 'KPI').
# NOTE(review): presumably the labels the loaded model predicts, in output
# order - confirm against the training code.
categories = [
    '开关语音播报', '打电话', '发短信', '发邮件', '导航', '离职倾向', 'KPI', '访问网站', '会议室预定',
    '设置提醒', '查日程安排', '查会议安排', '查会议室安排情况', '查月度工作任务', '查工作任务完成情况',
    '查月度预算执行情况', '查当月费用报销情况', '查借款情况', '查应收款', '查应付款', '查考勤', '查出差情况',
    '查天气', '查股票', '讲笑话', '讲故事', '讲新闻', '订机票', '订火车票'
]


def run(s, verbose=False):
    # Lower-case the input text.
    s = s.lower()
class FastText(torch.nn.Module):
    """Implement fasttext classification method

    Reference: "Bag of Tricks for Efficient Text Classification"

    One SUM-mode embedding is created per enabled feature (token, token
    n-gram, keyword, topic).  ``forward`` adds the per-feature sums, divides
    by the summed feature lengths, applies dropout and a linear layer.
    """

    def __init__(self, dataset, config):
        """Create the embeddings, the linear output layer and the dropout.

        Args:
            dataset: provides the vocabulary maps (``token_map``,
                ``token_ngram_map``, ``keyword_map``, ``topic_map``),
                ``label_map`` and ``VOCAB_PADDING``.
            config: configuration object; the ``feature``, ``embedding`` and
                ``train`` sections are read here.

        Raises:
            AssertionError: if "token" is not among the configured features.
        """
        super(FastText, self).__init__()
        self.config = config
        assert "token" in self.config.feature.feature_names

        def build_embedding(vocab_map, feature_name, **extra):
            # Every feature shares the same settings; only the vocabulary,
            # the feature name and (optionally) a pretrained file differ.
            return Embedding(
                vocab_map, config.embedding.dimension, feature_name, config,
                padding_idx=dataset.VOCAB_PADDING,
                mode=EmbeddingProcessType.SUM, dropout=0,
                init_type=config.embedding.initializer,
                low=-config.embedding.uniform_bound,
                high=config.embedding.uniform_bound,
                std=config.embedding.random_stddev,
                activation_type=ActivationType.NONE,
                **extra)

        # Sub-modules are registered in the same order as before so that
        # parameter ordering (and therefore checkpoints) is unchanged.
        self.token_embedding = build_embedding(
            dataset.token_map, cDataset.DOC_TOKEN,
            pretrained_embedding_file=config.feature.token_pretrained_file)
        if self.config.feature.token_ngram > 1:
            self.token_ngram_embedding = build_embedding(
                dataset.token_ngram_map, cDataset.DOC_TOKEN_NGRAM)
        if "keyword" in self.config.feature.feature_names:
            self.keyword_embedding = build_embedding(
                dataset.keyword_map, cDataset.DOC_KEYWORD,
                pretrained_embedding_file=
                config.feature.keyword_pretrained_file)
        if "topic" in self.config.feature.feature_names:
            self.topic_embedding = build_embedding(
                dataset.topic_map, cDataset.DOC_TOPIC)

        self.linear = torch.nn.Linear(config.embedding.dimension,
                                      len(dataset.label_map))
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)

    def get_parameter_optimizer_dict(self):
        """Return one ``{'params': ...}`` group per active sub-module.

        The groups follow the order token, token n-gram, keyword, topic,
        linear; optional features are included only when enabled in the
        configuration.
        """
        params = [{'params': self.token_embedding.parameters()}]
        if self.config.feature.token_ngram > 1:
            params.append({'params': self.token_ngram_embedding.parameters()})
        if "keyword" in self.config.feature.feature_names:
            params.append({'params': self.keyword_embedding.parameters()})
        if "topic" in self.config.feature.feature_names:
            params.append({'params': self.topic_embedding.parameters()})
        params.append({'params': self.linear.parameters()})
        return params

    def update_lr(self, optimizer, epoch):
        """Update lr

        Every parameter group gets the configured learning rate once
        ``epoch`` exceeds ``config.train.num_epochs_static_embedding``;
        up to and including that epoch the learning rate is set to 0.
        """
        if epoch > self.config.train.num_epochs_static_embedding:
            new_lr = self.config.optimizer.learning_rate
        else:
            new_lr = 0
        for param_group in optimizer.param_groups:
            param_group["lr"] = new_lr

    def forward(self, batch):
        """Return the output of the linear layer for ``batch``.

        Args:
            batch: mapping from the ``cDataset`` feature keys to tensors
                (ids, offsets and lengths for every enabled feature).

        Returns:
            The result of ``self.linear`` applied to the length-normalised,
            dropped-out sum of the feature embeddings.

        The tensors stored in ``batch`` are never modified.
        """
        device = self.config.device
        feature_names = self.config.feature.feature_names

        doc_embedding = self.token_embedding(
            batch[cDataset.DOC_TOKEN].to(device),
            batch[cDataset.DOC_TOKEN_OFFSET].to(device))
        # Tensor.to() hands back the very same tensor when it already lives
        # on `device`; clone so the in-place accumulation below cannot
        # corrupt the caller's batch.
        length = batch[cDataset.DOC_TOKEN_LEN].to(device).clone()

        if self.config.feature.token_ngram > 1:
            doc_embedding += self.token_ngram_embedding(
                batch[cDataset.DOC_TOKEN_NGRAM].to(device),
                batch[cDataset.DOC_TOKEN_NGRAM_OFFSET].to(device))
            length += batch[cDataset.DOC_TOKEN_NGRAM_LEN].to(device)
        if "keyword" in feature_names:
            doc_embedding += self.keyword_embedding(
                batch[cDataset.DOC_KEYWORD].to(device),
                batch[cDataset.DOC_KEYWORD_OFFSET].to(device))
            length += batch[cDataset.DOC_KEYWORD_LEN].to(device)
        if "topic" in feature_names:
            doc_embedding += self.topic_embedding(
                batch[cDataset.DOC_TOPIC].to(device),
                batch[cDataset.DOC_TOPIC_OFFSET].to(device))
            length += batch[cDataset.DOC_TOPIC_LEN].to(device)

        # reshape (unlike resize_) leaves `length` itself untouched and
        # raises if the element count does not match the batch size.
        doc_embedding /= length.reshape(doc_embedding.size(0), 1)
        doc_embedding = self.dropout(doc_embedding)
        return self.linear(doc_embedding)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import time

if __name__ == "__main__":
    # Skip-gram dataset built from the raw text file.
    # NOTE(review): the `3` is presumably the context window size -- confirm
    # against SkipGram.__init__.
    dataset = SkipGram('data/rawdata.txt', 3)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1024,
                                             shuffle=True)
    # One row per vocabulary entry.
    # NOTE(review): the `2` is presumably the embedding dimension -- confirm
    # against Embedding.__init__.
    net = Embedding(len(dataset.idx_to_token), 2)
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-1, momentum=0.9)
    MAX_EPOCH = 2500
    print('MAX_EPOCH', MAX_EPOCH)
    for epoch in range(MAX_EPOCH):
        # Restart the timer and the running statistics at the first epoch of
        # every block of 100 epochs (epoch 0, 100, 200, ...).
        if (epoch + 1) % 100 == 1:
            start, l_sum, n = time.time(), 0.0, 0
        for center_word, context_word, negative_word in dataloader:
            optimizer.zero_grad()
            # Each batch field is reshaped to a column before being passed
            # in; the value returned by `net` is used directly as the loss
            # (it is back-propagated and read with .item() below).
            l = net(center_word.view(-1, 1), context_word.view(-1, 1),
                    negative_word.view(-1, 1))
            l.backward()
            optimizer.step()
            # Accumulate the loss as a Python float.
            l_sum += l.cpu().item()
def train():
    """Profile forward/backward passes of one ParallelTransformerLayer.

    Sizes and dropout rates are read from the module-level ``args``.  Random
    data is fed through the layer for five iterations; each iteration is
    timed with ``Profiler`` under an overall tag plus separate forward and
    backward tags, and results are written under
    ``benchmark/<args.exp_name>``.
    """
    # Initialize torch.distributed
    init_distributed()
    print_rank_0('AutoMP: training ParallelTransformerLayer...')

    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    device = torch.cuda.current_device()
    input_indices = torch.randint(
        low=0, high=vocab_size, size=(batch_size, sequence_length)).to(device)
    # `labels` is not consumed by this single-layer benchmark; it is still
    # created so the random stream and GPU allocations match earlier runs.
    labels = torch.randint(
        low=0, high=vocab_size, size=(batch_size, sequence_length)).to(device)
    position_indices = torch.tile(
        torch.arange(start=0, end=sequence_length),
        (batch_size, 1)).to(device)

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    # The embedding is run once; only the size of its output is used later,
    # as the shape of the random layer inputs.
    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=hidden_dropout,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)

    transformer_layer = ParallelTransformerLayer(
        attention_mask_func=gpt2_attention_mask_func,
        layer_number=0,
        hidden_size=hidden_size,
        layernorm_epsilon=layernorm_epsilon,
        num_attention_heads=num_attention_heads,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout)

    nproc = torch.distributed.get_world_size()

    # attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
    # NOTE(review): randint(low=0, high=2) only yields 0 or 1, so `< 0` is
    # False everywhere and nothing is masked -- confirm this is intended.
    attention_mask = (torch.randint(
        low=0,
        high=2,
        size=(sequence_length,
              divide(num_attention_heads, nproc),
              batch_size,
              batch_size)) < 0).cuda()

    optimizer = torch.optim.SGD(transformer_layer.parameters(), lr=0.01)
    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    # The profiler tags do not change between iterations; build them once,
    # outside the timed regions.
    suffix = (f'np-{nproc}_hs-{hidden_size}'
              f'_nah-{num_attention_heads}_bsz-{batch_size}')
    overall_name = f'transformer_layer_{suffix}'
    fname = f'transformer_layer_forward_{suffix}'
    bname = f'transformer_layer_backward_{suffix}'

    num_epochs = 5
    for _ in range(num_epochs):
        # A fresh random input is generated before the timers start.
        input_ = torch.rand(size=embedding_output.size()).cuda()

        profiler.start(overall_name)

        # Forward pass
        profiler.start(fname)
        loss = transformer_layer.forward(input_, attention_mask)
        train_loss = torch.mean(loss)
        # Wait for the queued GPU work so the timer covers the real compute.
        torch.cuda.synchronize()
        profiler.stop(fname)

        # Backward pass
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)
def __init__(self, dataset, config):
    """Create the token and char embedding layers plus the hidden dropout.

    The layer class is chosen by ``config.embedding.type``: plain
    ``Embedding`` layers in FLAT mode, or ``RegionEmbeddingLayer`` layers.
    Only the token layer receives a pretrained embedding file.

    Args:
        dataset: provides ``token_map``, ``char_map`` and ``VOCAB_PADDING``.
        config: configuration object; the ``feature``, ``embedding`` and
            ``train`` sections are read here.

    Raises:
        AssertionError: unless exactly one feature is configured and it is
            "token" or "char".
        TypeError: if ``config.embedding.type`` is not a supported type.
    """
    super(Classifier, self).__init__()
    self.config = config

    feature_names = self.config.feature.feature_names
    assert len(feature_names) == 1
    assert feature_names[0] in ("token", "char")

    emb_cfg = config.embedding

    def shared_kwargs():
        # Initialisation settings common to both layer kinds.
        return dict(
            dropout=emb_cfg.dropout,
            init_type=emb_cfg.initializer,
            low=-emb_cfg.uniform_bound,
            high=emb_cfg.uniform_bound,
            std=emb_cfg.random_stddev,
            fan_mode=emb_cfg.fan_mode)

    if emb_cfg.type == EmbeddingType.EMBEDDING:

        def make_layer(vocab_map, feature_name, **extra):
            return Embedding(
                vocab_map, emb_cfg.dimension, feature_name, config,
                dataset.VOCAB_PADDING,
                mode=EmbeddingProcessType.FLAT,
                activation_type=ActivationType.NONE,
                **shared_kwargs(), **extra)

    elif emb_cfg.type == EmbeddingType.REGION_EMBEDDING:

        def make_layer(vocab_map, feature_name, **extra):
            return RegionEmbeddingLayer(
                vocab_map, emb_cfg.dimension, emb_cfg.region_size,
                feature_name, config,
                padding=dataset.VOCAB_PADDING,
                region_embedding_type=emb_cfg.region_embedding_type,
                **shared_kwargs(), **extra)

    else:
        raise TypeError(
            "Unsupported embedding type: %s. " % config.embedding.type)

    # Token layer first, then char layer, matching the registration order
    # used so far.
    self.token_embedding = make_layer(
        dataset.token_map, cDataset.DOC_TOKEN,
        pretrained_embedding_file=config.feature.token_pretrained_file)
    self.char_embedding = make_layer(dataset.char_map, cDataset.DOC_CHAR)

    self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)
class Graph(): def __init__(self, data, is_train, model_config): self.model_config = model_config self.data = data self.is_train = is_train self.model_fn = None self.rand_unif_init = tf.random_uniform_initializer(-0, .08, 0.08) self.metric = Metric(self.model_config, self.data) def embedding_fn(self, inputs, embedding): if type(inputs) == list: if not inputs: return [] else: return [ tf.nn.embedding_lookup(embedding, inp) for inp in inputs ] else: return tf.nn.embedding_lookup(embedding, inputs) def output_to_logit(self, prev_out, w, b): prev_logit = tf.add(tf.matmul(prev_out, tf.transpose(w)), b) return prev_logit def create_model_multigpu(self): losses = [] grads = [] ops = [tf.constant(0)] self.objs = [] self.global_step = tf.train.get_or_create_global_step() optim = self.get_optim() fetch_data = None if self.model_config.fetch_mode == 'tf_example_dataset': fetch_data = self.data.get_data_sample() with tf.variable_scope(tf.get_variable_scope()) as scope: for gpu_id in range(self.model_config.num_gpus): with tf.device('/device:GPU:%d' % gpu_id): with tf.name_scope('%s_%d' % ('gpu_scope', gpu_id)): loss, obj = self.create_model(fetch_data=fetch_data) if self.model_config.npad_mode == 'v1': vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope= 'model/transformer_decoder/decoder/layer_5/npad/' ) grad = optim.compute_gradients( loss, colocate_gradients_with_ops=True, var_list=vars) elif self.model_config.npad_mode == 'static_seq': vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/transformer_decoder/npad/') grad = optim.compute_gradients( loss, colocate_gradients_with_ops=True, var_list=vars) else: grad = optim.compute_gradients( loss, colocate_gradients_with_ops=True) tf.get_variable_scope().reuse_variables() losses.append(loss) grads.append(grad) if 'rule' in self.model_config.memory and self.is_train: ops.append(obj['mem_contexts']) ops.append(obj['mem_outputs']) ops.append(obj['mem_counter']) self.objs.append(obj) with 
tf.variable_scope('optimization'): self.loss = tf.divide(tf.add_n(losses), self.model_config.num_gpus) self.perplexity = tf.exp(tf.reduce_mean(self.loss)) if self.is_train: avg_grad = self.average_gradients(grads) grads = [g for (g, v) in avg_grad] clipped_grads, _ = tf.clip_by_global_norm( grads, self.model_config.max_grad_norm) if self.model_config.npad_mode == 'v1': vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/transformer_decoder/decoder/layer_5/npad/' ) elif self.model_config.npad_mode == 'static_seq': vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/transformer_decoder/npad/') else: vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) self.train_op = optim.apply_gradients( zip(clipped_grads, vars), global_step=self.global_step) self.increment_global_step = tf.assign_add(self.global_step, 1) self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2) self.ops = tf.tuple(ops) def create_model(self, fetch_data=None): with tf.variable_scope('variables'): sentence_simple_input_placeholder = [] sentence_complex_input_placeholder = [] if self.model_config.subword_vocab_size and self.model_config.seg_mode: sentence_simple_segment_input_placeholder = [] sentence_complex_segment_input_placeholder = [] obj = {} if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset': for t in tf.unstack(fetch_data['line_comp_ids'], axis=1): sentence_complex_input_placeholder.append(t) for t in tf.unstack(fetch_data['line_simp_ids'], axis=1): sentence_simple_input_placeholder.append(t) if self.model_config.subword_vocab_size and self.model_config.seg_mode: for t in tf.unstack(fetch_data['line_comp_segids'], axis=1): sentence_complex_segment_input_placeholder.append(t) for t in tf.unstack(fetch_data['line_simp_segids'], axis=1): sentence_simple_segment_input_placeholder.append(t) obj['line_comp_segids'] = tf.stack( sentence_complex_segment_input_placeholder, axis=1) obj['line_simp_segids'] = tf.stack( 
sentence_simple_segment_input_placeholder, axis=1) score = None if self.model_config.tune_style: if self.is_train: # In training, score are from fetch data scores = [] if self.model_config.tune_style[0]: ppdb_score = fetch_data['ppdb_score'] scores.append(ppdb_score) print('Tune ppdb score!') if 'plus' in self.model_config.tune_mode: # to avoid most ppdb scores are 0 ppdb_score += 0.1 if self.model_config.tune_style[1]: add_score = fetch_data['dsim_score'] scores.append(add_score) print('Tune dsim_score score!') if self.model_config.tune_style[2]: add_score = fetch_data['add_score'] scores.append(add_score) print('Tune add score!') if self.model_config.tune_style[3]: len_score = fetch_data['len_score'] scores.append(len_score) print('Tune length score!') else: # In evaluating/predict, scores may be a factor to multiply if in pred mode # or actual user provided score # TODO(sanqiang): not used for now because not fech_data in eval raise NotImplementedError('No tune style for training') # ppdb_score = tf.constant( # self.model_config.tune_style[0], shape=[self.model_config.batch_size], dtype=tf.float32) # add_score = tf.constant( # self.model_config.tune_style[1], shape=[self.model_config.batch_size], dtype=tf.float32) # len_score = tf.constant( # self.model_config.tune_style[2], shape=[self.model_config.batch_size], dtype=tf.float32) # Assemble scores dimension_unit = int(self.model_config.dimension / len(scores)) dimension_runit = self.model_config.dimension - ( len(scores) - 1) * dimension_unit for s_i, score in enumerate(scores): if s_i < len(scores) - 1: scores[s_i] = tf.expand_dims(tf.tile( tf.expand_dims(scores[s_i], axis=-1), [1, dimension_unit]), axis=1) else: scores[s_i] = tf.expand_dims(tf.tile( tf.expand_dims(scores[s_i], axis=-1), [1, dimension_runit]), axis=1) score = tf.concat(scores, axis=-1) else: for step in range(self.model_config.max_simple_sentence): sentence_simple_input_placeholder.append( tf.zeros(self.model_config.batch_size, tf.int32, 
name='simple_input')) for step in range(self.model_config.max_complex_sentence): sentence_complex_input_placeholder.append( tf.zeros(self.model_config.batch_size, tf.int32, name='complex_input')) if self.model_config.subword_vocab_size and self.model_config.seg_mode: for step in range(self.model_config.max_simple_sentence): sentence_simple_segment_input_placeholder.append( tf.zeros(self.model_config.batch_size, tf.int32, name='simple_seg_input')) for step in range(self.model_config.max_complex_sentence): sentence_complex_segment_input_placeholder.append( tf.zeros(self.model_config.batch_size, tf.int32, name='complex_seg_input')) obj['line_comp_segids'] = tf.stack( sentence_complex_segment_input_placeholder, axis=1) obj['line_simp_segids'] = tf.stack( sentence_simple_segment_input_placeholder, axis=1) score = None if self.model_config.tune_style: if self.is_train: raise NotImplementedError('No tune style for training') # # ppdb_score = tf.constant( # self.model_config.tune_style, shape=[self.model_config.batch_size], dtype=tf.float32) # ppdb_score = tf.expand_dims(tf.tile( # tf.expand_dims(ppdb_score, axis=-1), # [1, self.model_config.dimension]), axis=1) else: scores = [] if self.model_config.tune_style: if self.model_config.tune_style[0]: ppdb_score = tf.constant( self.model_config.tune_style[0], shape=[self.model_config.batch_size], dtype=tf.float32) scores.append(ppdb_score) print('tune ppdb score') if self.model_config.tune_style[1]: dsim_score = tf.constant( self.model_config.tune_style[1], shape=[self.model_config.batch_size], dtype=tf.float32) scores.append(dsim_score) print('tune dsim score') if self.model_config.tune_style[2]: add_score = tf.constant( self.model_config.tune_style[2], shape=[self.model_config.batch_size], dtype=tf.float32) scores.append(add_score) print('tune add score') if self.model_config.tune_style[3]: len_score = tf.constant( self.model_config.tune_style[3], shape=[self.model_config.batch_size], dtype=tf.float32) 
scores.append(len_score) print('tune len score') # Assemble scores dimension_unit = int(self.model_config.dimension / len(scores)) dimension_runit = self.model_config.dimension - ( len(scores) - 1) * dimension_unit for s_i, score in enumerate(scores): if s_i < len(scores) - 1: scores[s_i] = tf.expand_dims(tf.tile( tf.expand_dims(scores[s_i], axis=-1), [1, dimension_unit]), axis=1) else: scores[s_i] = tf.expand_dims(tf.tile( tf.expand_dims(scores[s_i], axis=-1), [1, dimension_runit]), axis=1) score = tf.concat(scores, axis=-1) # For self.model_config.tune_style: comp_features = {} comp_add_score = tf.zeros(self.model_config.batch_size, tf.float32, name='comp_add_score_input') comp_length = tf.zeros(self.model_config.batch_size, tf.float32, name='comp_length_input') comp_features['comp_add_score'] = comp_add_score comp_features['comp_length'] = comp_length sentence_idxs = tf.zeros(self.model_config.batch_size, tf.int32, name='sent_idx') self.embedding = Embedding(self.data.vocab_complex, self.data.vocab_simple, self.model_config) if self.model_config.bert_mode: emb_complex = None else: emb_complex = self.embedding.get_complex_embedding() if self.model_config.bert_mode and ( self.model_config.tie_embedding == 'all' or self.model_config.tie_embedding == 'enc_dec'): emb_simple = None else: emb_simple = self.embedding.get_simple_embedding() if (self.is_train and self.model_config.pretrained_embedding): self.embed_complex_placeholder = tf.placeholder( tf.float32, (self.data.vocab_complex.vocab_size(), self.model_config.dimension), 'complex_emb') self.replace_emb_complex = emb_complex.assign( self.embed_complex_placeholder) self.embed_simple_placeholder = tf.placeholder( tf.float32, (self.data.vocab_simple.vocab_size(), self.model_config.dimension), 'simple_emb') self.replace_emb_simple = emb_simple.assign( self.embed_simple_placeholder) if self.model_config.bert_mode and ( self.model_config.tie_embedding == 'all' or self.model_config.tie_embedding == 'dec_out'): w = None 
else: w = self.embedding.get_w() b = self.embedding.get_b() mem_contexts, mem_outputs, mem_counter = None, None, None rule_id_input_placeholder, rule_target_input_placeholder = [], [] if 'rule' in self.model_config.memory: with tf.device('/cpu:0'): context_size = 0 if self.model_config.framework == 'transformer': context_size = 1 elif self.model_config.framework == 'seq2seq': context_size = 2 mem_contexts = tf.get_variable( 'mem_contexts', initializer=tf.constant( 0, dtype=tf.float32, shape=(self.data.vocab_rule.get_rule_size(), self.model_config.max_target_rule_sublen, self.model_config.dimension * context_size)), trainable=False, dtype=tf.float32) mem_outputs = tf.get_variable( 'mem_outputs', initializer=tf.constant( 0, dtype=tf.float32, shape=(self.data.vocab_rule.get_rule_size(), self.model_config.max_target_rule_sublen, self.model_config.dimension)), trainable=False, dtype=tf.float32) mem_counter = tf.get_variable( 'mem_counter', initializer=tf.constant( 0, dtype=tf.int32, shape=(self.data.vocab_rule.get_rule_size(), 1)), trainable=False, dtype=tf.int32) if 'direct' in self.model_config.memory or 'rule' in self.model_config.memory: if fetch_data is not None and self.model_config.fetch_mode == 'tf_example_dataset': for t in tf.unstack(fetch_data['rule_id'], axis=1): rule_id_input_placeholder.append(t) for t in tf.unstack(fetch_data['rule_target'], axis=1): rule_target_input_placeholder.append(t) else: for step in range(self.model_config.max_cand_rules): rule_id_input_placeholder.append( tf.zeros(self.model_config.batch_size, tf.int32, name='rule_id_input')) for step in range(self.model_config.max_cand_rules): if 'direct' in self.model_config.memory: rule_target_input_placeholder.append( tf.zeros(self.model_config.batch_size, tf.int32, name='rule_target_input')) elif 'rule' in self.model_config.memory: rule_target_input_placeholder.append( tf.zeros(self.model_config.batch_size, tf.string, name='rule_target_input')) with tf.variable_scope('model'): output = 
self.model_fn(sentence_complex_input_placeholder, emb_complex, sentence_simple_input_placeholder, emb_simple, w, b, rule_id_input_placeholder, rule_target_input_placeholder, mem_contexts, mem_outputs, self.global_step, score, comp_features, obj) encoder_embs, final_outputs = None, None if self.model_config.replace_unk_by_emb: encoder_embs = tf.stack(output.encoder_embed_inputs_list, axis=1) if output.decoder_outputs_list is not None: if type(output.decoder_outputs_list) == list: decoder_outputs_list = output.decoder_outputs_list decoder_outputs = tf.stack(decoder_outputs_list, axis=1) else: decoder_outputs = output.decoder_outputs_list if output.final_outputs_list is not None: if type(output.final_outputs_list) == list: final_outputs_list = output.final_outputs_list final_outputs = tf.stack(final_outputs_list, axis=1) else: final_outputs = output.final_outputs_list attn_distr = None if self.model_config.replace_unk_by_attn: attn_distr = output.attn_distr_list if not self.is_train: # in beam search, it directly provide decoder target list decoder_target = tf.stack(output.decoder_target_list, axis=1) loss = tf.reduce_mean(output.decoder_score) obj = { 'sentence_idxs': sentence_idxs, 'sentence_simple_input_placeholder': sentence_simple_input_placeholder, 'sentence_complex_input_placeholder': sentence_complex_input_placeholder, 'decoder_target_list': decoder_target, 'final_outputs': final_outputs, 'encoder_embs': encoder_embs, 'attn_distr': attn_distr } if self.model_config.subword_vocab_size and self.model_config.seg_mode: obj['sentence_complex_segment_input_placeholder'] = sentence_complex_segment_input_placeholder obj['sentence_simple_segment_input_placeholder'] = sentence_simple_segment_input_placeholder if 'rule' in self.model_config.memory or 'direct' in self.model_config.memory: obj['rule_id_input_placeholder'] = rule_id_input_placeholder obj['rule_target_input_placeholder'] = rule_target_input_placeholder if self.model_config.tune_style: obj['comp_features'] = 
comp_features return loss, obj else: # Memory Populate if 'rule' in self.model_config.memory: # Update Memory through python injection def update_memory(mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp, decoder_targets, decoder_outputs, contexts, rule_target_input_placeholder, rule_id_input_placeholder, global_step, encoder_outputs): def _seq_contain(arr, tar): j = 0 for i in range(len(arr)): if arr[i] == tar[j]: j += 1 if j == len(tar): return i - len(tar) + 1 else: j = 0 return -1 # if 'stopgrad' in self.model_config.rl_configs and global_step % 2 != 0: # return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp # if global_step <= self.model_config.memory_prepare_step: # return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp batch_size = np.shape(rule_target_input_placeholder)[0] max_rules = np.shape(rule_target_input_placeholder)[1] decoder_targets_str = [ ' '.join(sent) for sent in truncate_sents( decode( decoder_targets, self.data.vocab_simple, self.model_config.subword_vocab_size > 0 or 'bert_token' in self.model_config.bert_mode)) ] for batch_id in range(batch_size): cur_decoder_targets = decoder_targets[batch_id, :] cur_decoder_targets_str = decoder_targets_str[ batch_id] cur_decoder_outputs = decoder_outputs[batch_id, :] cur_contexts = contexts[batch_id, :] cur_rule_target_input_placeholder = rule_target_input_placeholder[ batch_id, :] cur_rule_target_input_placeholder = [ tmp.decode("utf-8").strip('\x00') for tmp in cur_rule_target_input_placeholder if not tmp.decode("utf-8").strip().startswith( constant.SYMBOL_PAD) ] cur_rule_id_input_placeholder = rule_id_input_placeholder[ batch_id, :] # Build the valid mapper from rule id => target words ids rule_mapper = {} for step in range( len(cur_rule_target_input_placeholder)): rule_target_str = cur_rule_target_input_placeholder[ step] if rule_target_str == constant.SYMBOL_PAD: continue rule_id = cur_rule_id_input_placeholder[step] if rule_id != 0 and re.search( r'\b%s\b' % rule_target_str, 
cur_decoder_targets_str ): # decoder_target_str in cur_decoder_targets_str: decoder_target_wids = self.data.vocab_simple.encode( rule_target_str) dec_s_idx = _seq_contain( cur_decoder_targets, decoder_target_wids) if dec_s_idx != -1: print('rule_target_str:%s' % rule_target_str) print('cur_decoder_targets_str:%s' % cur_decoder_targets_str) print('cur_decoder_targets:%s' % cur_decoder_targets) print('decoder_target_wids:%s' % decoder_target_wids) rule_mapper[rule_id] = list( range( dec_s_idx, dec_s_idx + len(decoder_target_wids))) for rule_id in rule_mapper: dec_idxs = rule_mapper[rule_id] for idx, dec_idx in enumerate(dec_idxs): if mem_counter_tmp[rule_id, 0] == 0: mem_contexts_tmp[ rule_id, idx, :] = cur_contexts[dec_idx, :] mem_outputs_tmp[ rule_id, idx, :] = cur_decoder_outputs[ dec_idx, :] else: mem_contexts_tmp[rule_id, idx, :] = ( cur_contexts[dec_idx, :] + mem_contexts_tmp[rule_id, idx, :]) / 2 mem_outputs_tmp[rule_id, idx, :] = ( cur_decoder_outputs[dec_idx, :] + mem_outputs_tmp[rule_id, idx, :]) / 2 mem_counter_tmp[rule_id, 0] += 1 return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp mem_output_input = None if 'mofinal' in self.model_config.memory_config: mem_output_input = final_outputs # elif 'modecode' in self.model_config.memory_config: # mem_output_input = decoder_outputs # elif 'moemb' in self.model_config.memory_config: # mem_output_input = tf.stack( # self.embedding_fn(sentence_simple_input_placeholder, emb_simple), # axis=1) mem_contexts, mem_outputs, mem_counter = tf.py_func( update_memory, [ mem_contexts, mem_outputs, mem_counter, tf.stack(output.decoder_target_list, axis=1), mem_output_input, output.contexts, tf.stack(rule_target_input_placeholder, axis=1), tf.stack(rule_id_input_placeholder, axis=1), self.global_step, output.encoder_outputs ], [tf.float32, tf.float32, tf.int32], stateful=False, name='update_memory') #Loss and corresponding prior/mask decode_word_weight_list = [ tf.to_float( tf.not_equal( d, self.data.vocab_simple.encode( 
constant.SYMBOL_PAD))) for d in output.gt_target_list ] decode_word_weight = tf.stack(decode_word_weight_list, axis=1) gt_target = tf.stack(output.gt_target_list, axis=1) def self_critical_loss(): # For minimize the negative log of probabilities rewards = tf.py_func( self.metric.self_crititcal_reward, [ sentence_idxs, tf.stack(output.sample_target_list, axis=-1), tf.stack(output.decoder_target_list, axis=-1), tf.stack(sentence_simple_input_placeholder, axis=-1), tf.stack(sentence_complex_input_placeholder, axis=-1), tf.ones((1, 1)), # tf.stack(rule_target_input_placeholder, axis=1) ], tf.float32, stateful=False, name='reward') rewards.set_shape((self.model_config.batch_size, self.model_config.max_simple_sentence)) rewards = tf.unstack(rewards, axis=1) weighted_probs_list = [ rewards[i] * decode_word_weight_list[i] * -output.sample_logit_list[i] for i in range(len(decode_word_weight_list)) ] total_size = tf.reduce_sum(decode_word_weight_list) total_size += 1e-12 weighted_probs = tf.reduce_sum( weighted_probs_list) / total_size loss = weighted_probs return loss def teacherforce_critical_loss(): losses = [] for step in range(self.model_config.max_simple_sentence): logit = output.decoder_logit_list[step] greedy_target_unit = tf.stop_gradient( tf.argmax(logit, axis=1)) if self.model_config.train_mode == 'teachercriticalv2': sampled_target_unit, reward = tf.py_func( self.metric.self_crititcal_reward_unitv2, [ sentence_idxs, step, greedy_target_unit, tf.stack(sentence_simple_input_placeholder, axis=-1), tf.stack( sentence_complex_input_placeholder, axis=-1), self.global_step ], [tf.int32, tf.float32], stateful=False, name='reward') reward.set_shape((self.model_config.batch_size, )) sampled_target_unit.set_shape( (self.model_config.batch_size, )) elif self.model_config.train_mode == 'teachercritical': sampled_target_unit = tf.cast( tf.squeeze(tf.multinomial(logit, 1), axis=1), tf.int32) sampled_target_unit, reward = tf.py_func( self.metric.self_crititcal_reward_unit, [ 
sentence_idxs, step, sampled_target_unit, greedy_target_unit, tf.stack(sentence_simple_input_placeholder, axis=-1), tf.stack( sentence_complex_input_placeholder, axis=-1), self.global_step, ], [tf.int32, tf.float32], stateful=False, name='reward') reward.set_shape((self.model_config.batch_size, )) sampled_target_unit.set_shape( (self.model_config.batch_size, )) indices = tf.stack([ tf.range(0, self.model_config.batch_size, dtype=tf.int32), tf.squeeze(sampled_target_unit) ], axis=-1) logit_unit = tf.gather_nd(tf.nn.softmax(logit, axis=1), indices) decode_word_weight = decode_word_weight_list[step] losses.append(-logit_unit * reward * decode_word_weight) loss = tf.add_n(losses) return loss def teacherforce_loss(): if self.model_config.number_samples > 0: loss_fn = tf.nn.sampled_softmax_loss else: loss_fn = None loss = sequence_loss( logits=tf.stack(output.decoder_logit_list, axis=1), targets=gt_target, weights=decode_word_weight, # softmax_loss_function=loss_fn, # w=w, # b=b, # decoder_outputs=decoder_outputs, # number_samples=self.model_config.number_samples ) return loss if self.model_config.train_mode == 'dynamic_self-critical': loss = self_critical_loss() # loss = tf.cond( # tf.greater(self.global_step, 50000), # # tf.logical_and(tf.greater(self.global_step, 100000), tf.equal(tf.mod(self.global_step, 2), 0)), # lambda : self_critical_loss(), # lambda : teacherforce_loss()) elif self.model_config.train_mode == 'teachercritical' or self.model_config.train_mode == 'teachercriticalv2': loss = tf.cond(tf.equal(tf.mod(self.global_step, 2), 0), lambda: teacherforce_loss(), lambda: teacherforce_critical_loss()) # loss = teacherforce_critical_loss() else: loss = teacherforce_loss() if self.model_config.architecture == 'ut2t': assert 'extra_encoder_loss' in output.obj_tensors and 'extra_decoder_loss' in output.obj_tensors loss += output.obj_tensors['extra_encoder_loss'] loss += output.obj_tensors['extra_decoder_loss'] print('Use U T2T with ACT') self.loss_style = 
tf.constant(0.0, dtype=tf.float32) if output.pred_score_tuple is not None and 'pred' in self.model_config.tune_mode: print('Create loss for predicting style') ppdb_pred_score, add_pred_score, len_pred_score = output.pred_score_tuple # ppdb_pred_score = tf.Print(ppdb_pred_score, [ppdb_pred_score, fetch_data['ppdb_score']], # message='ppdb_pred_score:', first_n=-1, summarize=100) # add_pred_score = tf.Print(add_pred_score, [add_pred_score, fetch_data['add_score']], # message='add_pred_score:', first_n=-1, summarize=100) # len_pred_score = tf.Print(len_pred_score, [len_pred_score, fetch_data['len_score']], # message='len_pred_score:', first_n=-1, summarize=100) # loss = tf.Print(loss, [loss], message='loss before:', summarize=100) self.loss_style += tf.losses.absolute_difference( ppdb_pred_score, fetch_data['ppdb_score']) self.loss_style += tf.losses.absolute_difference( add_pred_score, fetch_data['add_score']) self.loss_style += tf.losses.absolute_difference( len_pred_score, fetch_data['len_score']) loss += self.loss_style # loss = tf.Print(loss, [loss], message='loss after:', summarize=100) obj = { 'decoder_target_list': output.decoder_target_list, 'sentence_idxs': sentence_idxs, 'sentence_simple_input_placeholder': sentence_simple_input_placeholder, 'sentence_complex_input_placeholder': sentence_complex_input_placeholder, } self.logits = output.decoder_logit_list if 'rule' in self.model_config.memory: obj['rule_id_input_placeholder'] = rule_id_input_placeholder obj['rule_target_input_placeholder'] = rule_target_input_placeholder # obj['rule_pair_input_placeholder'] = rule_pair_input_placeholder obj['mem_contexts'] = mem_contexts obj['mem_outputs'] = mem_outputs obj['mem_counter'] = mem_counter return loss, obj def get_optim(self): learning_rate = tf.constant(self.model_config.learning_rate) if self.model_config.optimizer == 'adagrad': opt = tf.train.AdagradOptimizer(learning_rate) # Adam need lower learning rate elif self.model_config.optimizer == 'adam': opt = 
tf.train.AdamOptimizer(learning_rate) elif self.model_config.optimizer == 'lazy_adam': if not hasattr(self, 'hparams'): # In case not using Transformer model from tensor2tensor.models import transformer self.hparams = transformer.transformer_base() opt = tf.contrib.opt.LazyAdamOptimizer( self.hparams.learning_rate / 100.0, beta1=self.hparams.optimizer_adam_beta1, beta2=self.hparams.optimizer_adam_beta2, epsilon=self.hparams.optimizer_adam_epsilon) elif self.model_config.optimizer == 'adadelta': opt = tf.train.AdadeltaOptimizer(learning_rate) elif self.model_config.optimizer == 'sgd': opt = tf.train.GradientDescentOptimizer(learning_rate) else: raise Exception('Not Implemented Optimizer!') # if self.model_config.max_grad_staleness > 0: # opt = tf.contrib.opt.DropStaleGradientOptimizer(opt, self.model_config.max_grad_staleness) return opt # Got from https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py#L101 def average_gradients(self, tower_grads): """Calculate the average gradient for each shared variable across all towers. Note that this function provides a synchronization point across all towers. Args: tower_grads: List of lists of (gradient, variable) tuples. The outer list is over individual gradients. The inner list is over the gradient calculation for each tower. Returns: List of pairs of (gradient, variable) where the gradient has been averaged across all towers. """ average_grads = [] for grad_and_vars in zip(*tower_grads): # Note that each grad_and_vars looks like the following: # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) grads = [] for g, _ in grad_and_vars: # Add 0 dimension to the gradients to represent the tower. if g is None: print('Useless tensors:%s' % grad_and_vars) expanded_g = tf.expand_dims(g, 0) # Append on a 'tower' dimension which we will average over below. grads.append(expanded_g) # Average over the 'tower' dimension. 
grad = tf.concat(axis=0, values=grads) grad = tf.reduce_mean(grad, 0) # Keep in mind that the Variables are redundant because they are shared # across towers. So .. we will just return the first tower's pointer to # the Variable. v = grad_and_vars[0][1] grad_and_var = (grad, v) average_grads.append(grad_and_var) return average_grads
class Classifier(torch.nn.Module):
    """Base class for text classifiers built on token/char embeddings.

    The constructor creates a token embedding, a char embedding and a
    hidden-layer dropout module from ``config``. ``forward`` is left
    unimplemented here and raises ``NotImplementedError``.
    """

    def __init__(self, dataset, config):
        super().__init__()
        self.config = config
        emb_cfg = self.config.embedding
        # Keyword arguments that both embedding tables share.
        shared_kwargs = dict(
            mode=EmbeddingProcessType.FLAT,
            dropout=emb_cfg.dropout,
            init_type=emb_cfg.initializer,
            low=-emb_cfg.uniform_bound,
            high=emb_cfg.uniform_bound,
            std=emb_cfg.random_stddev,
            fan_mode=emb_cfg.fan_mode,
            activation_type=ActivationType.NONE,
            model_mode=dataset.model_mode,
        )
        # Only the token embedding is given a pretrained embedding file.
        self.token_embedding = Embedding(
            dataset.token_map,
            config.embedding.dimension,
            cDataset.DOC_TOKEN,
            config,
            dataset.VOCAB_PADDING,
            pretrained_embedding_file=config.feature.token_pretrained_file,
            **shared_kwargs)
        self.char_embedding = Embedding(
            dataset.char_map,
            config.embedding.dimension,
            cDataset.DOC_CHAR,
            config,
            dataset.VOCAB_PADDING,
            **shared_kwargs)
        self.dropout = torch.nn.Dropout(p=config.train.hidden_layer_dropout)

    def get_embedding(self, batch, pad_shape=None, pad_value=0):
        """Embed the token ids of ``batch``.

        Args:
            batch: Mapping holding the ``cDataset.DOC_TOKEN``,
                ``cDataset.DOC_TOKEN_LEN`` and ``cDataset.DOC_TOKEN_MASK``
                entries; each is moved to ``self.config.device``.
            pad_shape: Optional padding spec handed to
                ``torch.nn.functional.pad``; ``None`` skips padding.
            pad_value: Constant used to fill the padded positions.

        Returns:
            Tuple ``(embedding, length, mask)``.
        """
        device = self.config.device
        token_id = batch[cDataset.DOC_TOKEN].to(device)
        if pad_shape is not None:
            token_id = torch.nn.functional.pad(
                token_id, pad_shape, mode='constant', value=pad_value)
        return (
            self.token_embedding(token_id),
            batch[cDataset.DOC_TOKEN_LEN].to(device),
            batch[cDataset.DOC_TOKEN_MASK].to(device),
        )

    def get_parameter_optimizer_dict(self):
        """Return the optimizer parameter groups owned by this base class.

        A single group is returned: the token embedding parameters, tagged
        with ``is_embedding=True``.
        """
        return [{
            'params': self.token_embedding.parameters(),
            'is_embedding': True
        }]

    def update_lr(self, optimizer, epoch):
        """Set the learning rate of the first two optimizer param groups.

        The rate is ``config.optimizer.learning_rate`` once ``epoch`` exceeds
        ``config.train.num_epochs_static_embedding`` and ``0`` before that.
        """
        unfrozen = epoch > self.config.train.num_epochs_static_embedding
        for group in optimizer.param_groups[:2]:
            group["lr"] = (
                self.config.optimizer.learning_rate if unfrozen else 0)

    def forward(self, batch):
        """Subclasses must provide the forward pass."""
        raise NotImplementedError
def parse_emb_str(embs_str: str):
    """Parse a whitespace-separated string of numbers into an ``Embedding``.

    Each token of ``embs_str`` is converted with ``float`` and appended to a
    fresh ``Embedding`` through ``add_dim``, in order of appearance.

    Raises:
        ValueError: If a token cannot be converted to ``float``.
    """
    tokens = embs_str.split()
    parsed = Embedding()
    # map() is lazy, so conversion and add_dim stay interleaved per token.
    for value in map(float, tokens):
        parsed.add_dim(value)
    return parsed
import os
import random

import jieba
import numpy as np

# import thulac
# thulac_seg = thulac.thulac(seg_only=True)

max_sample = 1000
max_steps = 15
n_embedding = 300

base_path = os.path.dirname(os.path.realpath(__file__))
path = os.path.join(base_path, 'word2vec_model/wiki.zh.nosync/wiki.zh.vec')
model_path = os.path.join(base_path, Config.CURRENT_MODEL_BASE_PATH)

# Load the word vectors once at import time.
embedding = Embedding()
embedding.load_w2v_model(path, False)

# Maps an integer label to the list of (text, label) samples carrying it.
data = dict()
data_dir_path = os.path.join(model_path, 'data.nosync')
for fname in os.listdir(data_dir_path):
    # Only files whose name contains '.txt' are treated as training data.
    if '.txt' not in fname:
        continue
    train_file_path = os.path.join(data_dir_path, fname)
    # Explicit encoding: the default is locale-dependent and the corpus is
    # non-ASCII text (presumably UTF-8 — confirm against the data files).
    with open(train_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            comps = line.strip().split()
            if not comps:
                # Blank lines (e.g. a trailing newline) carry no sample;
                # indexing comps[1] on them would raise IndexError.
                continue
            label = int(comps[1])
            data.setdefault(label, []).append((comps[0], label))
def create_model(self):
    """Build the graph for one model replica and return its loss.

    Creates the input tensors, the embedding tables and (when ``'rule'`` is
    in ``model_config.memory``) the rule memory variables, runs
    ``self.model_fn``, then builds either the evaluation outputs or the
    training loss selected by ``model_config.train_mode``.

    Returns:
        A ``(loss, obj)`` tuple. ``obj`` is a dict of tensors keyed by name:
        the input tensors, plus decoded targets / final outputs /
        encoder embeddings / attention in evaluation mode, or the rule
        inputs and memory tensors in training mode when rule memory is on.
        In training mode ``self.logits`` is also set to
        ``output.decoder_logit_list``.
    """
    batch_size = self.model_config.batch_size

    with tf.variable_scope('variables'):
        # Inputs are zero tensors rather than tf.placeholder ops; presumably
        # the caller overrides them through feed_dict — TODO confirm.
        sentence_simple_input_placeholder = []
        for step in range(self.model_config.max_simple_sentence):
            sentence_simple_input_placeholder.append(
                tf.zeros(batch_size, tf.int32, name='simple_input'))

        sentence_complex_input_placeholder = []
        for step in range(self.model_config.max_complex_sentence):
            sentence_complex_input_placeholder.append(
                tf.zeros(batch_size, tf.int32, name='complex_input'))

        sentence_idxs = tf.zeros(batch_size, tf.int32, name='sent_idx')

        embedding = Embedding(self.data.vocab_complex, self.data.vocab_simple,
                              self.model_config)
        emb_complex = embedding.get_complex_embedding()
        emb_simple = embedding.get_simple_embedding()

        w = embedding.get_w()
        b = embedding.get_b()

        mem_contexts, mem_outputs, mem_counter = None, None, None
        rule_id_input_placeholder, rule_target_input_placeholder = [], []
        rule_pair_input_placeholder = []
        if 'rule' in self.model_config.memory:
            with tf.device('/cpu:0'):
                # Width multiplier of a stored context vector; stays 0 for
                # any framework other than the two handled below.
                context_size = 0
                if self.model_config.framework == 'transformer':
                    context_size = 1
                elif self.model_config.framework == 'seq2seq':
                    context_size = 2
                rule_size = self.data.vocab_rule.get_rule_size()
                mem_contexts = tf.get_variable(
                    'mem_contexts',
                    initializer=tf.constant(
                        0, dtype=tf.float32,
                        shape=(rule_size,
                               self.model_config.dimension * context_size)),
                    trainable=False, dtype=tf.float32)
                mem_outputs = tf.get_variable(
                    'mem_outputs',
                    initializer=tf.constant(
                        0, dtype=tf.float32,
                        shape=(rule_size, self.model_config.dimension)),
                    trainable=False, dtype=tf.float32)
                mem_counter = tf.get_variable(
                    'mem_counter',
                    initializer=tf.constant(
                        0, dtype=tf.int32, shape=(rule_size, 1)),
                    trainable=False, dtype=tf.int32)

            for step in range(self.model_config.max_cand_rules):
                rule_id_input_placeholder.append(
                    tf.zeros(batch_size, tf.int32, name='rule_id_input'))

            for step in range(self.model_config.max_cand_rules):
                rule_target_input_placeholder.append(
                    tf.zeros(batch_size, tf.int32, name='rule_target_input'))

            for step in range(self.model_config.max_cand_rules):
                rule_pair_input_placeholder.append(
                    tf.zeros([batch_size, 2], tf.int32,
                             name='rule_pair_input'))

    with tf.variable_scope('model'):
        output = self.model_fn(sentence_complex_input_placeholder, emb_complex,
                               sentence_simple_input_placeholder, emb_simple,
                               w, b, rule_id_input_placeholder, mem_contexts,
                               mem_outputs, self.global_step)

        encoder_embs, final_outputs = None, None
        if self.model_config.replace_unk_by_emb:
            encoder_embs = tf.stack(output.encoder_embed_inputs_list, axis=1)

        # The model may hand back either a per-step list or a single value;
        # lists are stacked along axis 1.
        if output.decoder_outputs_list is not None:
            if isinstance(output.decoder_outputs_list, list):
                decoder_outputs = tf.stack(output.decoder_outputs_list, axis=1)
            else:
                decoder_outputs = output.decoder_outputs_list

        if output.final_outputs_list is not None:
            if isinstance(output.final_outputs_list, list):
                final_outputs = tf.stack(output.final_outputs_list, axis=1)
            else:
                final_outputs = output.final_outputs_list

        attn_distr = None
        if self.model_config.replace_unk_by_attn:
            attn_distr = output.attn_distr_list

        if not self.is_train:
            # in beam search, it directly provide decoder target list
            decoder_target = tf.stack(output.decoder_target_list, axis=1)
            loss = tf.reduce_mean(output.decoder_score)
            obj = {
                'sentence_idxs': sentence_idxs,
                'sentence_simple_input_placeholder': sentence_simple_input_placeholder,
                'sentence_complex_input_placeholder': sentence_complex_input_placeholder,
                'decoder_target_list': decoder_target,
                'final_outputs': final_outputs,
                'encoder_embs': encoder_embs,
                'attn_distr': attn_distr
            }
            if 'rule' in self.model_config.memory:
                obj['rule_id_input_placeholder'] = rule_id_input_placeholder
                obj['rule_target_input_placeholder'] = rule_target_input_placeholder
            return loss, obj

        # ---- Training graph ----
        # Memory Populate
        if 'rule' in self.model_config.memory:
            # Update Memory through python injection
            def update_memory(mem_contexts_tmp, mem_outputs_tmp,
                              mem_counter_tmp, decoder_targets,
                              decoder_outputs, contexts,
                              rule_target_input_placeholder,
                              rule_id_input_placeholder, global_step,
                              emb_simple, encoder_outputs):
                """Fold this batch's rule matches into the memory arrays.

                Runs inside tf.py_func. For every rule id in a sample, finds
                the positions where the rule's target tokens occur in the
                decoder targets, averages the context/output vectors over
                the rule's tokens, and stores that average (or the mean of
                it and the existing entry) in the memory row for that rule.
                Returns the three memory arrays.
                """
                if global_step <= self.model_config.memory_prepare_step:
                    return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp
                batch_size = np.shape(rule_target_input_placeholder)[0]
                max_rules = np.shape(rule_target_input_placeholder)[1]
                for batch_id in range(batch_size):
                    cur_decoder_targets = decoder_targets[batch_id, :]
                    cur_decoder_outputs = decoder_outputs[batch_id, :]
                    cur_contexts = contexts[batch_id, :]
                    cur_rule_target_input_placeholder = \
                        rule_target_input_placeholder[batch_id, :]
                    cur_rule_id_input_placeholder = \
                        rule_id_input_placeholder[batch_id, :]

                    # rule id -> target tokens of that rule; id 0 is skipped.
                    rule_mapper = {}
                    for step in range(max_rules):
                        rule_id = cur_rule_id_input_placeholder[step]
                        if rule_id != 0:
                            decoder_target = \
                                cur_rule_target_input_placeholder[step]
                            if rule_id not in rule_mapper:
                                rule_mapper[rule_id] = []
                            rule_mapper[rule_id].append(decoder_target)

                    for rule_id in rule_mapper:
                        rule_targets = rule_mapper[rule_id]
                        decoder_target_orders = np.where(
                            cur_decoder_targets == rule_targets[0])[0]
                        for decoder_target_order in decoder_target_orders:
                            # Only the 2nd and 3rd rule tokens are checked
                            # against the decoder targets.
                            # NOTE(review): rules longer than three tokens
                            # are not bounds-checked past the third —
                            # confirm rules never exceed that length.
                            if len(rule_targets) > 1:
                                if (decoder_target_order + 1 >= len(cur_decoder_targets)
                                        or rule_targets[1] != cur_decoder_targets[decoder_target_order + 1]):
                                    continue
                            if len(rule_targets) > 2:
                                if (decoder_target_order + 2 >= len(cur_decoder_targets)
                                        or rule_targets[2] != cur_decoder_targets[decoder_target_order + 2]):
                                    continue
                            cur_context, cur_outputs = None, None
                            for step, _ in enumerate(rule_targets):
                                if step == 0:
                                    # Copy: plain indexing yields views, so
                                    # the in-place += / /= below would
                                    # otherwise overwrite the py_func inputs
                                    # and skew later matches on these rows.
                                    cur_context = cur_contexts[
                                        decoder_target_order, :].copy()
                                    cur_outputs = cur_decoder_outputs[
                                        decoder_target_order, :].copy()
                                else:
                                    cur_context += cur_contexts[
                                        step + decoder_target_order, :]
                                    cur_outputs += cur_decoder_outputs[
                                        step + decoder_target_order, :]
                            cur_context /= len(rule_targets)
                            cur_outputs /= len(rule_targets)
                            if mem_counter_tmp[rule_id, 0] == 0:
                                mem_contexts_tmp[rule_id, :] = cur_context
                                mem_outputs_tmp[rule_id, :] = cur_outputs
                            else:
                                mem_contexts_tmp[rule_id, :] = (
                                    cur_context + mem_contexts_tmp[rule_id, :]) / 2
                                mem_outputs_tmp[rule_id, :] = (
                                    cur_outputs + mem_outputs_tmp[rule_id, :]) / 2
                            mem_counter_tmp[rule_id, 0] += 1

                return mem_contexts_tmp, mem_outputs_tmp, mem_counter_tmp

            # NOTE(review): stays None unless 'mofinal' is in memory_config,
            # and None is then passed to tf.py_func below — confirm that
            # configuration is always set when rule memory is enabled.
            mem_output_input = None
            if 'mofinal' in self.model_config.memory_config:
                mem_output_input = final_outputs
            # elif 'modecode' in self.model_config.memory_config:
            #     mem_output_input = decoder_outputs
            # elif 'moemb' in self.model_config.memory_config:
            #     mem_output_input = tf.stack(
            #         self.embedding_fn(sentence_simple_input_placeholder, emb_simple),
            #         axis=1)

            mem_contexts, mem_outputs, mem_counter = tf.py_func(
                update_memory,
                [
                    mem_contexts, mem_outputs, mem_counter,
                    tf.stack(output.decoder_target_list, axis=1),
                    mem_output_input, output.contexts,
                    tf.stack(rule_target_input_placeholder, axis=1),
                    tf.stack(rule_id_input_placeholder, axis=1),
                    self.global_step, emb_simple, output.encoder_outputs
                ],
                [tf.float32, tf.float32, tf.int32],
                stateful=False, name='update_memory')

        # Loss and corresponding prior/mask
        # Per-step weights: 1.0 where the ground truth is not PAD, else 0.0.
        pad_id = self.data.vocab_simple.encode(constant.SYMBOL_PAD)
        decode_word_weight_list = [
            tf.to_float(tf.not_equal(d, pad_id))
            for d in output.gt_target_list
        ]
        decode_word_weight = tf.stack(decode_word_weight_list, axis=1)
        gt_target = tf.stack(output.gt_target_list, axis=1)

        def self_critical_loss():
            """Reward-weighted loss over the sampled sequence."""
            # For minimize the negative log of probabilities
            rewards = tf.py_func(
                self.metric.self_crititcal_reward,
                [
                    sentence_idxs,
                    tf.stack(output.sample_target_list, axis=-1),
                    tf.stack(output.decoder_target_list, axis=-1),
                    tf.stack(sentence_simple_input_placeholder, axis=-1),
                    tf.stack(sentence_complex_input_placeholder, axis=-1),
                    tf.ones((1, 1)),
                    # tf.stack(rule_target_input_placeholder, axis=1)
                ],
                tf.float32, stateful=False, name='reward')
            # py_func outputs have unknown static shape; restore it.
            rewards.set_shape((self.model_config.batch_size,
                               self.model_config.max_simple_sentence))
            rewards = tf.unstack(rewards, axis=1)

            weighted_probs_list = [
                rewards[i] * decode_word_weight_list[i] * -output.sample_logit_list[i]
                for i in range(len(decode_word_weight_list))
            ]
            total_size = tf.reduce_sum(decode_word_weight_list)
            total_size += 1e-12  # avoid division by zero on all-PAD batches
            weighted_probs = tf.reduce_sum(weighted_probs_list) / total_size
            loss = weighted_probs
            return loss

        def teacherforce_critical_loss():
            """Per-step reward-weighted loss on teacher-forced logits."""
            losses = []
            for step in range(self.model_config.max_simple_sentence):
                logit = output.decoder_logit_list[step]
                greedy_target_unit = tf.stop_gradient(tf.argmax(logit, axis=1))
                if self.model_config.train_mode == 'teachercriticalv2':
                    # v2: the python reward function also picks the sample.
                    sampled_target_unit, reward = tf.py_func(
                        self.metric.self_crititcal_reward_unitv2,
                        [
                            sentence_idxs, step, greedy_target_unit,
                            tf.stack(sentence_simple_input_placeholder, axis=-1),
                            tf.stack(sentence_complex_input_placeholder, axis=-1),
                        ],
                        [tf.int32, tf.float32],
                        stateful=False, name='reward')
                    reward.set_shape((self.model_config.batch_size, ))
                    sampled_target_unit.set_shape(
                        (self.model_config.batch_size, ))
                elif self.model_config.train_mode == 'teachercritical':
                    # v1: sample in-graph, then score the sample in python.
                    sampled_target_unit = tf.cast(
                        tf.squeeze(tf.multinomial(logit, 1), axis=1), tf.int32)
                    reward = tf.py_func(
                        self.metric.self_crititcal_reward_unit,
                        [
                            sentence_idxs, step, sampled_target_unit,
                            greedy_target_unit,
                            tf.stack(sentence_simple_input_placeholder, axis=-1),
                            tf.stack(sentence_complex_input_placeholder, axis=-1),
                            tf.ones((1, 1)),
                        ],
                        tf.float32, stateful=False, name='reward')
                    reward.set_shape((self.model_config.batch_size, ))
                # Pick, per batch row, the softmax value of the sampled id.
                indices = tf.stack(
                    [
                        tf.range(0, self.model_config.batch_size, dtype=tf.int32),
                        tf.squeeze(sampled_target_unit)
                    ],
                    axis=-1)
                logit_unit = tf.gather_nd(tf.nn.softmax(logit, axis=1), indices)
                decode_word_weight = decode_word_weight_list[step]
                losses.append(-logit_unit * reward * decode_word_weight)
            loss = tf.add_n(losses)
            return loss

        def teacherforce_loss():
            """PAD-weighted sequence loss against the ground-truth targets."""
            # The sampled-softmax arguments below are currently disabled.
            loss = sequence_loss(
                logits=tf.stack(output.decoder_logit_list, axis=1),
                targets=gt_target,
                weights=decode_word_weight,
                # softmax_loss_function=loss_fn,
                # w=w,
                # b=b,
                # decoder_outputs=decoder_outputs,
                # number_samples=self.model_config.number_samples
            )
            return loss

        if self.model_config.train_mode == 'dynamic_self-critical':
            loss = self_critical_loss()
            # loss = tf.cond(
            #     tf.greater(self.global_step, 50000),
            #     # tf.logical_and(tf.greater(self.global_step, 100000), tf.equal(tf.mod(self.global_step, 2), 0)),
            #     lambda : self_critical_loss(),
            #     lambda : teacherforce_loss())
        elif self.model_config.train_mode in ('teachercritical',
                                              'teachercriticalv2'):
            # Every third global step uses the teacher-forcing loss; the
            # other steps use the critical loss.
            loss = tf.cond(
                tf.equal(tf.mod(self.global_step, 3), 0),
                lambda: teacherforce_loss(),
                lambda: teacherforce_critical_loss())
            # loss = teacherforce_critical_loss()
        else:
            loss = teacherforce_loss()

        # if 'ruleattn' in self.model_config.external_loss:
        #     batch_pos = tf.range(
        #         self.model_config.batch_size * self.model_config.max_cand_rules) // self.model_config.max_cand_rules
        #     batch_pos = tf.reshape(
        #         batch_pos, [self.model_config.batch_size, self.model_config.max_cand_rules])
        #     batch_pos = tf.expand_dims(batch_pos, axis=2)
        #     ids = tf.stack(rule_pair_input_placeholder, axis=1)
        #     bias = 1.0 - tf.to_float(
        #         tf.logical_and(tf.equal(ids[:, :, 0], 0), tf.equal(ids[:, :, 1], 0)))
        #     ids = tf.concat([batch_pos, ids], axis=2)
        #     distrs = tf.stack(output.attn_distr_list, axis=1)
        #     ruleattn_loss = -tf.gather_nd(distrs, ids)*bias
        #     loss += ruleattn_loss
        #     self.pairs = tf.stack(rule_pair_input_placeholder, axis=1)

        obj = {
            'sentence_idxs': sentence_idxs,
            'sentence_simple_input_placeholder': sentence_simple_input_placeholder,
            'sentence_complex_input_placeholder': sentence_complex_input_placeholder,
        }
        self.logits = output.decoder_logit_list
        if 'rule' in self.model_config.memory:
            obj['rule_id_input_placeholder'] = rule_id_input_placeholder
            obj['rule_target_input_placeholder'] = rule_target_input_placeholder
            obj['rule_pair_input_placeholder'] = rule_pair_input_placeholder
            obj['mem_contexts'] = mem_contexts
            obj['mem_outputs'] = mem_outputs
            obj['mem_counter'] = mem_counter
        return loss, obj
random.seed, torch.manual_seed, torch.cuda.manual_seed_all ]: set_random_seed(opts.seed) base_model = opts.backbone(pretrained=True) if opts.l2norm: model = L2NormEmbedding( base_model, feature_size=base_model.output_size, embedding_size=opts.embedding_size, ).cuda() else: model = Embedding( base_model, feature_size=base_model.output_size, embedding_size=opts.embedding_size, ).cuda() if opts.load is not None: model.load_state_dict(torch.load(opts.load)) print("Loaded Model from %s" % opts.load) train_transform, test_transform = build_transform(base_model) dataset_train = dataset.FashionInshop(opts.data, split="train", transform=train_transform) dataset_query = dataset.FashionInshop(opts.data, split="query", transform=test_transform)