def __init__(self, pc, conf: NodeExtractorConfHead, vocab: HLabelVocab, extract_type: str):
    super().__init__(pc, conf, vocab, extract_type)
    # node selector
    conf.sel_conf._input_dim = conf._input_dim  # make dims fit
    self.sel: NodeSelector = self.add_sub_node("sel", NodeSelector(pc, conf.sel_conf))
    # encoding
    self.dmxnn = conf.dmxnn
    self.posi_embed = self.add_sub_node("pe", RelPosiEmbedding(pc, conf.posi_dim, max=conf.posi_cut))
    if self.dmxnn:
        conf.e_enc._input_dim = conf._input_dim + conf.posi_dim
    else:
        conf.e_enc._input_dim = conf._input_dim
    self.e_encoder = self.add_sub_node("ee", MyEncoder(pc, conf.e_enc))
    e_enc_dim = self.e_encoder.get_output_dims()[0]
    # decoding
    # todo(note): dropout after pooling; todo(+N): cannot go to previous layers if there are no encoders
    self.special_drop = self.add_sub_node("sd", Dropout(pc, (e_enc_dim,)))
    self.use_lab_f = conf.use_lab_f
    self.lab_f_use_lexi = conf.lab_f_use_lexi
    if self.use_lab_f:
        lab_f_input_dims = [e_enc_dim] * 3 if self.dmxnn else [e_enc_dim]
        if self.lab_f_use_lexi:
            lab_f_input_dims.append(conf._lexi_dim)
        self.lab_f = self.add_sub_node(
            "lab", Affine(pc, lab_f_input_dims, conf.lab_conf.n_dim, act=conf.lab_f_act))
    else:
        self.lab_f = lambda x: x[0]  # only use the first one
    # secondary type
    self.use_secondary_type = conf.use_secondary_type
    if self.use_secondary_type:
        # todo(note): re-use vocab; or totally reuse the predictor?
        if conf.sectype_reuse_hl:
            self.hl2: HLabelNode = self.hl
        else:
            new_lab_conf = deepcopy(conf.lab_conf)
            new_lab_conf.zero_nil = False  # todo(note): not zero_nil here!
            self.hl2: HLabelNode = self.add_sub_node("hl", HLabelNode(pc, new_lab_conf, vocab))
        # enc+t1 -> t2
        self.t1tot2 = self.add_sub_node("1to2", Embedding(pc, self.hl_output_size, conf.lab_conf.n_dim))
    else:
        self.hl2 = None
        self.t1tot2 = None
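# ===== hedged sketch (illustration only; a clipped-offset lookup and the demo_* name are assumptions) =====
# When dmxnn is on, the event-encoder input dim is _input_dim + posi_dim:
# each token repr is concatenated with an embedding of its position relative
# to the candidate trigger (DMCNN-style). In numpy:
import numpy as np

def demo_relposi_concat(token_reprs, trigger_idx, posi_table, posi_cut):
    # token_reprs: [slen, D]; posi_table: [2*posi_cut+1, posi_dim]
    slen = token_reprs.shape[0]
    offsets = np.clip(np.arange(slen) - trigger_idx, -posi_cut, posi_cut)
    posi_embs = posi_table[offsets + posi_cut]            # [slen, posi_dim]
    return np.concatenate([token_reprs, posi_embs], -1)   # [slen, D+posi_dim]

assert demo_relposi_concat(np.zeros([5, 8]), 2, np.zeros([21, 4]), 10).shape == (5, 12)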
def __init__(self, pc: BK.ParamCollection, sconf: ScorerConf):
    super().__init__(pc, None, None)
    # options
    input_dim = sconf._input_dim
    arc_space = sconf.arc_space
    lab_space = sconf.lab_space
    ff_hid_size = sconf.ff_hid_size
    ff_hid_layer = sconf.ff_hid_layer
    use_biaffine = sconf.use_biaffine
    use_ff = sconf.use_ff
    use_ff2 = sconf.use_ff2
    biaffine_div = sconf.biaffine_div
    biaffine_init_ortho = sconf.biaffine_init_ortho
    transform_act = sconf.transform_act
    #
    self.input_dim = input_dim
    self.num_label = sconf._num_label
    # attach/arc
    self.arc_m = self.add_sub_node("am", Affine(pc, input_dim, arc_space, act=transform_act))
    self.arc_h = self.add_sub_node("ah", Affine(pc, input_dim, arc_space, act=transform_act))
    self.arc_scorer = self.add_sub_node(
        "as", BiAffineScorer(pc, arc_space, arc_space, 1, ff_hid_size, ff_hid_layer=ff_hid_layer,
                             use_biaffine=use_biaffine, use_ff=use_ff, use_ff2=use_ff2,
                             biaffine_div=biaffine_div, biaffine_init_ortho=biaffine_init_ortho))
    # only add distance for arc
    if sconf.arc_dist_clip > 0:
        # todo(+N): how to include dist feature?
        # self.dist_helper = self.add_sub_node("dh", AttDistHelper(pc, sconf.get_dist_aconf(), arc_space))
        self.dist_helper = None
        raise NotImplementedError("TODO")
    else:
        self.dist_helper = None
    # labeling
    self.lab_m = self.add_sub_node("lm", Affine(pc, input_dim, lab_space, act=transform_act))
    self.lab_h = self.add_sub_node("lh", Affine(pc, input_dim, lab_space, act=transform_act))
    self.lab_scorer = self.add_sub_node(
        "ls", BiAffineScorer(pc, lab_space, lab_space, self.num_label, ff_hid_size, ff_hid_layer=ff_hid_layer,
                             use_biaffine=use_biaffine, use_ff=use_ff, use_ff2=use_ff2,
                             biaffine_div=biaffine_div, biaffine_init_ortho=biaffine_init_ortho))
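# ===== hedged sketch (illustration only; a minimal numpy version, not the repo's BiAffineScorer) =====
# The arc scorer follows the deep-biaffine recipe (Dozat & Manning 2017):
# project tokens into head/mod spaces, then score every (head, mod) pair
# with a bias-augmented bilinear form:
import numpy as np

def demo_biaffine_arc_scores(x, Wh, Wm, U):
    # x: [slen, D]; Wh, Wm: [D, arc_space]; U: [arc_space+1, arc_space+1]
    h = np.tanh(x @ Wh)  # head projections [slen, arc_space]
    m = np.tanh(x @ Wm)  # mod projections  [slen, arc_space]
    h1 = np.concatenate([h, np.ones([len(h), 1])], -1)  # append bias feature
    m1 = np.concatenate([m, np.ones([len(m), 1])], -1)
    return h1 @ U @ m1.T  # [slen(head), slen(mod)] arc scores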
def __init__(self, pc: BK.ParamCollection, pname: str, input_dim: int, conf: SeqCrfNodeConf, inputter: Inputter):
    super().__init__(pc, conf, name="CRF")
    self.conf = conf
    self.inputter = inputter
    self.input_dim = input_dim
    # this step is performed at the embedder, thus still does not influence the inputter
    self.add_root_token = self.inputter.embedder.add_root_token
    # --
    self.pname = pname
    self.attr_name = pname + "_seq"  # attribute name in Instance
    self.vocab = inputter.vpack.get_voc(pname)
    # todo(note): we must make sure that 0 means NAN
    assert self.vocab.non == 0
    # models
    if conf.hid_dim <= 0:  # no hidden layer
        self.hid_layer = None
        self.pred_input_dim = input_dim
    else:
        self.hid_layer = self.add_sub_node("hid", Affine(pc, input_dim, conf.hid_dim, act=conf.hid_act))
        self.pred_input_dim = conf.hid_dim
    self.tagset_size = self.vocab.unk  # todo(note): UNK is the prediction boundary
    self.pred_layer = self.add_sub_node(
        "pr", Affine(pc, self.pred_input_dim, self.tagset_size + 2, init_rop=NoDropRop()))
    # transition matrix
    init_transitions = np.zeros([self.tagset_size + 2, self.tagset_size + 2])
    init_transitions[:, START_TAG] = -10000.0
    init_transitions[STOP_TAG, :] = -10000.0
    init_transitions[:, 0] = -10000.0
    init_transitions[0, :] = -10000.0
    self.transitions = self.add_param("T", (self.tagset_size + 2, self.tagset_size + 2), init=init_transitions)
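# ===== hedged sketch (illustration only; the START/STOP index convention is assumed) =====
# The -10000.0 rows/columns make certain transitions effectively impossible:
# into START, out of STOP, and into/out of tag 0 (the reserved NAN tag).
# Assuming START_TAG/STOP_TAG are the two extra indices appended to the tagset:
import numpy as np

tagset_size, NEG = 3, -10000.0
START_TAG, STOP_TAG = tagset_size, tagset_size + 1
T = np.zeros([tagset_size + 2, tagset_size + 2])
T[:, START_TAG] = NEG  # nothing transitions into START
T[STOP_TAG, :] = NEG   # nothing transitions out of STOP
T[:, 0] = NEG          # never move into the reserved NAN tag...
T[0, :] = NEG          # ...and never move out of it
assert T[1, START_TAG] == NEG and T[STOP_TAG, 1] == NEG and T[1, 2] == 0.0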
def __init__(self, pc, conf: NodeExtractorConfGene0, vocab: HLabelVocab, extract_type: str):
    super().__init__(pc, conf, vocab, extract_type)
    # decoding
    # -----
    # the two parts: actually in biaffine attention forms
    # transform embeddings for attention match (token evidence)
    self.T_tok = self.add_sub_node("at", Affine(pc, conf.lab_conf.n_dim, conf._input_dim, init_rop=NoDropRop()))
    # transform embeddings for global match (sent evidence)
    self.T_sent = self.add_sub_node("as", Affine(pc, conf.lab_conf.n_dim, conf._input_dim, init_rop=NoDropRop()))
    # to be refreshed
    self.query_tok = None  # [L, D]
    self.query_sent = None  # [L, D]
    # -----
    # how to combine the two parts: fix lambda or dynamic gated (with the input features)
    self.lambda_score_tok = conf.lambda_score_tok
    if self.lambda_score_tok < 0.:  # auto mode: using an MLP (make hidden size equal to input//4)
        self.score_gate = self.add_sub_node(
            "mix", get_mlp(pc, [conf._input_dim] * 4, 1, conf._input_dim, hidden_act="elu",
                           final_act="sigmoid", final_init_rop=NoDropRop(), hidden_which_affine=3))
    else:
        self.score_gate = None
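# ===== hedged sketch (illustration only; the demo_* name and gate placement are assumptions) =====
# Combining the two evidence scores: either a fixed lambda (conf value >= 0)
# or, in auto mode, a per-instance sigmoid gate g computed by the MLP above.
# Assuming the gate weights the token-evidence score:
def demo_mix_scores(score_tok, score_sent, lambda_tok, gate=None):
    # score_tok/score_sent: label scores from token vs. sentence evidence
    g = lambda_tok if gate is None else gate  # fixed scalar or dynamic in [0, 1]
    return g * score_tok + (1.0 - g) * score_sent

# fixed mode:  demo_mix_scores(s_tok, s_sent, 0.5)
# auto mode:   demo_mix_scores(s_tok, s_sent, None, gate=sigmoid_mlp_output)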
def __init__(self, pc, input_dim: int, inputp_dim: int, conf: DparG1DecoderConf, inputter: Inputter):
    super().__init__(pc, conf, name="dp")
    self.conf = conf
    self.inputter = inputter
    self.input_dim = input_dim
    self.inputp_dim = inputp_dim
    # checkout and assign vocab
    self._check_vocab()
    # -----
    # this step is performed at the embedder, thus still does not influence the inputter
    self.add_root_token = self.inputter.embedder.add_root_token
    assert self.add_root_token, "Currently assert this one!!"  # todo(+N)
    # -----
    # transform dp space
    if conf.pre_dp_space > 0:
        dp_space = conf.pre_dp_space
        self.pre_aff_m = self.add_sub_node("pm", Affine(pc, input_dim, dp_space, act=conf.pre_dp_act))
        self.pre_aff_h = self.add_sub_node("ph", Affine(pc, input_dim, dp_space, act=conf.pre_dp_act))
    else:
        dp_space = input_dim
        self.pre_aff_m = self.pre_aff_h = lambda x: x
    # dep pairwise scorer: output includes [0, r1) -> [non]+valid_words
    self.dps_node = self.add_sub_node(
        "dps", PairScorer(pc, dp_space, dp_space, self.dlab_r1, conf=conf.dps_conf, in_size_pair=inputp_dim))
    self.dps_s0_mask = np.array([1.] + [0.] * (self.dlab_r1 - 1))  # [1, 0, ..., 0]: only the "non" slot
    # whether detach input?
    self.no_detach_input = ScheduledValue("dpar:no_detach", conf.no_detach_input)
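# ===== hedged sketch (illustration only; numbers are made up, rationale is assumed) =====
# Why pre_dp_space plausibly exists: projecting head/mod reprs to a smaller
# space before the pairwise scorer shrinks the bilinear parameters (roughly
# D_h * D_m per output label) and the O(n^2) pairwise compute:
input_dim, pre_dp_space = 512, 128
params_full = input_dim * input_dim        # without pre-projection: 262144
params_pre = pre_dp_space * pre_dp_space   # with it: 16384, i.e. 16x smaller
assert params_full // params_pre == 16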
def __init__(self, pc, conf: CandidateExtractorConf, input_enc_dims):
    super().__init__(pc, None, None)
    self.conf = conf
    # scorer
    self.adp = self.add_sub_node('adp', TaskSpecAdp(pc, input_enc_dims, [], conf.hidden_dim))
    adp_hidden_size = self.adp.get_output_dims()[0]
    self.predictor = self.add_sub_node('pred', Affine(pc, adp_hidden_size, 2, init_rop=NoDropRop()))  # 0 as nil
    # others
    self.id_counter = defaultdict(int)  # docid->ef-count (make sure unique ef-id)
    self.valid_hlidx = HLabelIdx(["unk"], [1])
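# ===== hedged sketch (illustration only; the id format is assumed, not the repo's) =====
# The per-document counter mints unique entity-filler ids:
from collections import defaultdict

demo_id_counter = defaultdict(int)  # docid -> ef-count

def demo_new_ef_id(docid: str) -> str:
    demo_id_counter[docid] += 1
    return f"{docid}-ef{demo_id_counter[docid]}"

assert demo_new_ef_id("doc1") == "doc1-ef1"
assert demo_new_ef_id("doc1") == "doc1-ef2"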
def __init__(self, pc: BK.ParamCollection, slconf: SL1Conf):
    super().__init__(pc, None, None)
    self.dim = slconf._input_dim
    self.use_par = slconf.use_par
    self.use_chs = slconf.use_chs
    # parent and children attentional senc
    self.node_par = self.add_sub_node("npar", MultiHeadAttention(pc, self.dim, self.dim, self.dim, slconf.sl_par_att))
    self.node_chs = self.add_sub_node("nchs", MultiHeadAttention(pc, self.dim, self.dim, self.dim, slconf.sl_chs_att))
    self.ff_par = self.add_sub_node("par_ff", Affine(pc, self.dim, self.dim, act="tanh"))
    self.ff_chs = self.add_sub_node("chs_ff", Affine(pc, self.dim, self.dim, act="tanh"))
    # todo(note): currently simply sum them!
    self.mix_marginals_head_count = slconf.mix_marginals_head_count
    self.mix_marginals_rate = slconf.mix_marginals_rate
    if slconf.zero_extra_output_params:
        self.ff_par.zero_params()
        self.ff_chs.zero_params()
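# ===== hedged sketch (illustration only; the residual wiring is assumed) =====
# What zero_params() buys: assuming the extra structured features are added
# residually (as this sketch does; the repo's exact wiring may differ), a
# zeroed output Affine makes the whole block start as the identity.
import numpy as np

dim = 8
x = np.random.randn(5, dim)
W, b = np.zeros([dim, dim]), np.zeros([dim])  # zero_params() analogue
extra = np.tanh(x @ W + b)                    # all zeros at init
assert np.allclose(x + extra, x)              # residual sum == identity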
def __init__(self, pc: BK.ParamCollection, rconf: SL0Conf):
    super().__init__(pc, None, None)
    self.dim = rconf._input_dim  # both input/output dim
    # padders for child nodes
    self.chs_start_posi = -rconf.chs_num
    self.ch_idx_padder = DataPadder(2, pad_vals=0, mask_range=2)  # [*, num-ch]
    self.ch_label_padder = DataPadder(2, pad_vals=0)
    #
    self.label_embeddings = self.add_sub_node("label", Embedding(pc, rconf._num_label, rconf.dim_label, fix_row0=False))
    self.dim_label = rconf.dim_label
    # todo(note): now adopting flatten groupings for basic, and then that is all, no more recurrent features
    # group 1: [cur, chs, par] -> head_pre_size
    self.use_chs = rconf.use_chs
    self.use_par = rconf.use_par
    self.use_label_feat = rconf.use_label_feat
    # components (add the parameters anyway)
    # todo(note): children features: children + (label of mod->children)
    self.chs_reprer = self.add_sub_node("chs", ChsReprer(pc, rconf))
    self.chs_ff = self.add_sub_node(
        "chs_ff", Affine(pc, self.chs_reprer.get_output_dims()[0], self.dim, act="tanh"))
    # todo(note): parent features: parent + (label of parent->mod)
    # todo(warn): always add label related params
    par_ff_inputs = [self.dim, rconf.dim_label]
    self.par_ff = self.add_sub_node("par_ff", Affine(pc, par_ff_inputs, self.dim, act="tanh"))
    # no other groups anymore!
    if rconf.zero_extra_output_params:
        self.par_ff.zero_params()
        self.chs_ff.zero_params()
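# ===== hedged sketch (illustration only; a numpy stand-in for DataPadder) =====
# Children lists are ragged per head token, so they are padded to a
# rectangular [*, num-ch] index array plus a mask before batched gathering:
import numpy as np

def demo_pad_children(chs_lists, pad_val=0):
    width = max(len(c) for c in chs_lists)
    idxes = np.full([len(chs_lists), width], pad_val, dtype=np.int64)
    mask = np.zeros([len(chs_lists), width])
    for i, c in enumerate(chs_lists):
        idxes[i, :len(c)] = c
        mask[i, :len(c)] = 1.
    return idxes, mask  # gather child reprs/labels by idxes, zero out by mask

# demo_pad_children([[2, 3], [5]]) -> ([[2, 3], [5, 0]], [[1., 1.], [1., 0.]])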
def __init__(self, pc: BK.ParamCollection, conf: EmbedderNodeConf, vpack: VocabPackage):
    super().__init__(pc, None, None)
    self.conf = conf
    self.vpack = vpack
    self.add_root_token = conf.add_root_token
    # -----
    self.nodes = []  # params
    self.comp_names = []
    self.comp_dims = []  # real dims
    self.berter: Berter2 = None
    for comp_name, comp_conf in conf.ec_dict.items():
        if comp_conf.comp_dim > 0:
            # directly get the nodes
            one_node = InputEmbedNode.get_input_embed_node(comp_name, pc, comp_name, comp_conf, conf, vpack)
            comp_dim = one_node.get_output_dims()[0]  # fix dim
            # especially for berter
            if comp_name == "bert":
                assert self.berter is None
                self.berter = one_node.berter
            # general steps
            self.comp_names.append(comp_name)
            self.nodes.append(self.add_sub_node(f"EC{comp_name}", one_node))
            self.comp_dims.append(comp_dim)
    # final projection?
    self.has_proj = (conf.emb_proj_dim > 0)
    if self.has_proj:
        proj_layer = Affine(self.pc, sum(self.comp_dims), conf.emb_proj_dim,
                            act=conf.emb_proj_act, init_scale=conf.emb_proj_init_scale)
        if conf.emb_proj_norm:
            norm_layer = LayerNorm(self.pc, conf.emb_proj_dim)
            self.final_layer = self.add_sub_node("fl", Sequential(self.pc, [proj_layer, norm_layer]))
        else:
            self.final_layer = self.add_sub_node("fl", proj_layer)
        self.output_dim = conf.emb_proj_dim
    else:
        self.final_layer = None
        self.output_dim = sum(self.comp_dims)
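# ===== hedged sketch (illustration only; the component dims are made up) =====
# The output-dim rule in miniature: component outputs are concatenated, so
# without a projection the output dim is the sum of component dims; with
# emb_proj_dim > 0 it is the projection dim:
comp_dims = [300, 50, 64]        # e.g. word + char-cnn + pos
emb_proj_dim = 512
output_dim = emb_proj_dim if emb_proj_dim > 0 else sum(comp_dims)
assert output_dim == 512
assert sum(comp_dims) == 414     # the fallback if emb_proj_dim <= 0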
def __init__(self, pc: BK.ParamCollection, econf: EmbedConf, vpack: VocabPackage):
    super().__init__(pc, None, None)
    self.conf = econf
    #
    repr_sizes = []
    # word
    self.has_word = (econf.dim_word > 0)
    if self.has_word:
        npvec = vpack.get_emb("word") if econf.init_words_from_pretrain else None
        self.word_embed = self.add_sub_node(
            "ew", Embedding(self.pc, len(vpack.get_voc("word")), econf.dim_word,
                            npvec=npvec, name="word", freeze=econf.word_freeze))
        repr_sizes.append(econf.dim_word)
    # char
    self.has_char = (econf.dim_char > 0)
    if self.has_char:
        # todo(warn): cnns will also use emb's drop?
        self.char_embed = self.add_sub_node(
            "ec", Embedding(self.pc, len(vpack.get_voc("char")), econf.dim_char, name="char"))
        per_cnn_size = econf.char_cnn_hidden // len(econf.char_cnn_windows)
        self.char_cnns = [
            self.add_sub_node("cnnc", CnnLayer(self.pc, econf.dim_char, per_cnn_size, z, pooling="max", act="tanh"))
            for z in econf.char_cnn_windows
        ]
        repr_sizes.append(econf.char_cnn_hidden)
    # posi: absolute positional embeddings
    self.has_posi = (econf.dim_posi > 0)
    if self.has_posi:
        self.posi_embed = self.add_sub_node(
            "ep", PosiEmbedding(self.pc, econf.dim_posi, econf.posi_clip, econf.posi_fix_sincos, econf.posi_freeze))
        repr_sizes.append(econf.dim_posi)
    # extras: like POS, ...
    self.dim_extras = econf.dim_extras
    self.extra_names = econf.extra_names
    zcheck(len(self.dim_extras) == len(self.extra_names), "Unmatched dims and names!")
    self.extra_embeds = []
    for one_extra_dim, one_name in zip(self.dim_extras, self.extra_names):
        self.extra_embeds.append(self.add_sub_node(
            "ext", Embedding(self.pc, len(vpack.get_voc(one_name)), one_extra_dim,
                             npvec=vpack.get_emb(one_name, None), name="extra:" + one_name)))
        repr_sizes.append(one_extra_dim)
    # auxes
    self.dim_auxes = econf.dim_auxes
    self.fold_auxes = econf.fold_auxes
    self.aux_overall_gammas = []
    self.aux_fold_lambdas = []
    for one_aux_dim, one_aux_fold in zip(self.dim_auxes, self.fold_auxes):
        repr_sizes.append(one_aux_dim)
        # aux gamma and fold trainable lambdas
        self.aux_overall_gammas.append(self.add_param("AG", (), 1.))  # scalar
        self.aux_fold_lambdas.append(self.add_param(
            "AL", (), [1. / one_aux_fold for _ in range(one_aux_fold)]))  # [#fold]
    # =====
    # another projection layer? & set final dim
    if len(repr_sizes) <= 0:
        zwarn("No inputs??")
        # zcheck(len(repr_sizes)>0, "No inputs?")
    self.repr_sizes = repr_sizes
    self.has_proj = (econf.emb_proj_dim > 0)
    if self.has_proj:
        proj_layer = Affine(self.pc, sum(repr_sizes), econf.emb_proj_dim)
        if econf.emb_proj_norm:
            norm_layer = LayerNorm(self.pc, econf.emb_proj_dim)
            self.final_layer = self.add_sub_node("fl", Sequential(self.pc, [proj_layer, norm_layer]))
        else:
            self.final_layer = self.add_sub_node("fl", proj_layer)
        self.output_dim = econf.emb_proj_dim
    else:
        self.final_layer = None
        self.output_dim = sum(repr_sizes)
    # =====
    # special MdDropout: dropout the entire last dim (for word, char, extras, but not posi)
    self.dropmd_word = self.add_sub_node("md", DropoutLastN(pc, lastn=1))
    self.dropmd_char = self.add_sub_node("md", DropoutLastN(pc, lastn=1))
    self.dropmd_extras = [self.add_sub_node("md", DropoutLastN(pc, lastn=1)) for _ in self.extra_names]
    # dropouts for aux
    self.drop_auxes = [self.add_sub_node("aux", Dropout(pc, (one_aux_dim,))) for one_aux_dim in self.dim_auxes]
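# ===== hedged sketch (illustration only; downstream usage is assumed) =====
# The aux gamma/lambdas resemble an ELMo-style scalar mix: each aux input
# arrives as several "folds" (e.g. layers of a frozen encoder) combined as
# gamma * sum_i(lambda_i * fold_i). How the repo applies them downstream is
# an assumption; this only shows the arithmetic at the init values:
import numpy as np

def demo_scalar_mix(folds, lambdas, gamma):
    # folds: list of [slen, D] arrays; lambdas: [#fold]; gamma: scalar
    return gamma * sum(l * f for l, f in zip(lambdas, folds))

folds = [np.ones([4, 8]), 3 * np.ones([4, 8])]
mixed = demo_scalar_mix(folds, [0.5, 0.5], 1.0)  # lambdas init to 1/#fold
assert np.allclose(mixed, 2.0)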
def __init__(self, pc: BK.ParamCollection, input_dim: int, conf: PlainLMNodeConf, inputter: Inputter):
    super().__init__(pc, conf, name="PLM")
    self.conf = conf
    self.inputter = inputter
    self.input_dim = input_dim
    self.split_input_blm = conf.split_input_blm
    # this step is performed at the embedder, thus still does not influence the inputter
    self.add_root_token = self.inputter.embedder.add_root_token
    # vocab and padder
    vpack = inputter.vpack
    vocab_word = vpack.get_voc("word")
    # models
    real_input_dim = input_dim // 2 if self.split_input_blm else input_dim
    if conf.hid_dim <= 0:  # no hidden layer
        self.l2r_hid_layer = self.r2l_hid_layer = None
        self.pred_input_dim = real_input_dim
    else:
        self.l2r_hid_layer = self.add_sub_node("l2r_h", Affine(pc, real_input_dim, conf.hid_dim, act=conf.hid_act))
        self.r2l_hid_layer = self.add_sub_node("r2l_h", Affine(pc, real_input_dim, conf.hid_dim, act=conf.hid_act))
        self.pred_input_dim = conf.hid_dim
    # todo(note): unk is the first one above real words
    self.pred_size = min(conf.max_pred_rank + 1, vocab_word.unk)
    if conf.tie_input_embeddings:
        zwarn("Tie all preds in plm with input embeddings!!")
        self.l2r_pred = self.r2l_pred = None
        self.inputter_embed_node = self.inputter.embedder.get_node("word")
    else:
        self.l2r_pred = self.add_sub_node(
            "l2r_p", Affine(pc, self.pred_input_dim, self.pred_size, init_rop=NoDropRop()))
        if conf.tie_bidirect_pred:
            self.r2l_pred = self.l2r_pred
        else:
            self.r2l_pred = self.add_sub_node(
                "r2l_p", Affine(pc, self.pred_input_dim, self.pred_size, init_rop=NoDropRop()))
        self.inputter_embed_node = None
        if conf.init_pred_from_pretrain:
            npvec = vpack.get_emb("word")
            if npvec is None:
                zwarn("Pretrained vector not provided, skip init pred embeddings!!")
            else:
                with BK.no_grad_env():
                    self.l2r_pred.ws[0].copy_(BK.input_real(npvec[:self.pred_size].T))
                    self.r2l_pred.ws[0].copy_(BK.input_real(npvec[:self.pred_size].T))
                zlog(f"Init pred embeddings from pretrained vectors (size={self.pred_size}).")
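# ===== hedged sketch (illustration only; numbers are made up) =====
# Why the slice and the `.T` in the pretrained init above: the embedding
# table is [vocab, D] (one row per word), while the output Affine weight maps
# D -> pred_size, so the most frequent pred_size rows are kept and transposed:
import numpy as np

vocab_size, D, pred_size = 10000, 300, 5000
npvec = np.zeros([vocab_size, D])   # pretrained table [vocab, D]
w = npvec[:pred_size].T             # -> [D, pred_size], fits the Affine
hidden = np.zeros([7, D])           # [slen, D] prediction inputs
assert (hidden @ w).shape == (7, pred_size)  # [slen, pred_size] logits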
def __init__(self, pc, conf: HLabelNodeConf, hl_vocab: HLabelVocab, eff_max_layer=None):
    super().__init__(pc, None, None)
    self.conf = conf
    self.hl_vocab = hl_vocab
    assert self.hl_vocab.nil_as_zero  # for each layer, the idx=0 is the full-NIL
    # basic pool embeddings
    npvec = hl_vocab.pool_init_vec
    if not conf.pool_init_hint:
        npvec = None
    else:
        assert npvec is not None, "pool-init not provided by the Vocab!"
    n_dim, n_pool = conf.n_dim, len(hl_vocab.pools_k)
    self.pool_pred = self.add_sub_node(
        "pp", Embedding(pc, n_pool, n_dim, fix_row0=conf.zero_nil, npvec=npvec,
                        init_rop=(NoDropRop() if conf.nodrop_pred_embeds else None)))
    if conf.tie_embeds:
        self.pool_lookup = self.pool_pred
    else:
        self.pool_lookup = self.add_sub_node(
            "pl", Embedding(pc, n_pool, n_dim, fix_row0=conf.zero_nil, npvec=npvec,
                            init_rop=(NoDropRop() if conf.nodrop_lookup_embeds else None)))
    # layered labels embeddings (to be refreshed)
    self.max_layer = hl_vocab.max_layer
    self.layered_embeds_pred = [None] * self.max_layer
    self.layered_embeds_lookup = [None] * self.max_layer
    self.layered_prei = [None] * self.max_layer  # previous layer i, for score combining
    self.layered_isnil = [None] * self.max_layer  # whether is nil(None)
    self.zero_nil = conf.zero_nil
    # lookup summer
    assert conf.strategy_predict == "sum"
    self.lookup_is_sum, self.lookup_is_ff = [conf.strategy_lookup == z for z in ["sum", "ff"]]
    if self.lookup_is_ff:
        self.lookup_summer = self.add_sub_node("summer", Affine(pc, [n_dim] * self.max_layer, n_dim, act="tanh"))
    elif self.lookup_is_sum:
        self.sum_dropout = self.add_sub_node("sdrop", Dropout(pc, (n_dim,)))
        self.lookup_summer = lambda embeds: self.sum_dropout(BK.stack(embeds, 0).sum(0))
    else:
        raise NotImplementedError(f"UNK strategy_lookup: {conf.strategy_lookup}")
    # bias for prediction
    self.prediction_sizes = [len(hl_vocab.layered_pool_links_padded[i]) for i in range(self.max_layer)]
    if conf.bias_predict:
        self.biases_pred = [self.add_param(name="B", shape=(x,)) for x in self.prediction_sizes]
    else:
        self.biases_pred = [None] * self.max_layer
    # =====
    # training
    self.is_hinge_loss, self.is_prob_loss = [conf.loss_function == z for z in ["hinge", "prob"]]
    self.loss_lambdas = conf.loss_lambdas + [1.] * (self.max_layer - len(conf.loss_lambdas))  # loss scale
    self.margin_lambdas = conf.margin_lambdas + [0.] * (self.max_layer - len(conf.margin_lambdas))  # margin scale
    self.lookup_soft_alphas = conf.lookup_soft_alphas + [1.] * (self.max_layer - len(conf.lookup_soft_alphas))
    self.loss_fullnil_weight = conf.loss_fullnil_weight
    # ======
    # set current effective max_layer
    self.eff_max_layer = self.max_layer
    if eff_max_layer is not None:
        self.set_eff_max_layer(eff_max_layer)
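# ===== hedged sketch (illustration only; numpy analogue of the two idioms above) =====
# Two small conventions in plain Python: per-layer hyperparameter lists are
# right-padded to max_layer with a default, and the "sum" lookup strategy
# stacks the per-layer label embeddings and sums them:
import numpy as np

max_layer = 3
conf_loss_lambdas = [0.5]  # shorter than max_layer
loss_lambdas = conf_loss_lambdas + [1.] * (max_layer - len(conf_loss_lambdas))
assert loss_lambdas == [0.5, 1.0, 1.0]

layer_embeds = [np.ones([4, 8]), 2 * np.ones([4, 8]), np.zeros([4, 8])]
assert np.allclose(np.stack(layer_embeds, 0).sum(0), 3.0)  # the "sum" summer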
def __init__(self, pc, dim: int, conf: VRecConf):
    super().__init__(pc, None, None)
    self.conf = conf
    self.dim = dim
    # =====
    # Feat
    if conf.feat_mod == "matt":
        self.feat_node = self.add_sub_node("feat", MAttNode(pc, dim, dim, dim, conf.matt_conf))
        self.attn_count = conf.matt_conf.head_count
    elif conf.feat_mod == "fcomb":
        self.feat_node = self.add_sub_node("feat", FCombNode(pc, dim, dim, dim, conf.fc_conf))
        self.attn_count = conf.fc_conf.fc_count
    else:
        raise NotImplementedError()
    feat_out_dim = self.feat_node.get_output_dims()[0]
    # =====
    # Combiner
    if conf.comb_mode == "affine":
        self.comb_aff = self.add_sub_node(
            "aff", AffineCombiner(pc, [dim, feat_out_dim], [conf.comb_affine_q, conf.comb_affine_v],
                                  dim, out_act=conf.comb_affine_act, out_drop=conf.comb_affine_drop))
        self.comb_f = lambda q, v, c: (self.comb_aff([q, v]), None)
    elif conf.comb_mode == "lstm":
        self.comb_lstm = self.add_sub_node("lstm", LstmNode2(pc, feat_out_dim, dim))
        self.comb_f = self._call_lstm
    else:
        raise NotImplementedError()
    # =====
    # ff
    if conf.ff_dim > 0:
        self.has_ff = True
        self.linear1 = self.add_sub_node("l1", Affine(pc, dim, conf.ff_dim, act=conf.ff_act, init_rop=NoDropRop()))
        self.dropout1 = self.add_sub_node("d1", Dropout(pc, (conf.ff_dim,), fix_rate=conf.ff_drop))
        self.linear2 = self.add_sub_node("l2", Affine(pc, conf.ff_dim, dim, act="linear", init_rop=NoDropRop()))
        self.dropout2 = self.add_sub_node("d2", Dropout(pc, (dim,), fix_rate=conf.ff_drop))
    else:
        self.has_ff = False
    # layer norms
    if conf.use_pre_norm:
        self.att_pre_norm = self.add_sub_node("aln1", LayerNorm(pc, dim))
        self.ff_pre_norm = self.add_sub_node("fln1", LayerNorm(pc, dim))
    else:
        self.att_pre_norm = self.ff_pre_norm = None
    if conf.use_post_norm:
        self.att_post_norm = self.add_sub_node("aln2", LayerNorm(pc, dim))
        self.ff_post_norm = self.add_sub_node("fln2", LayerNorm(pc, dim))
    else:
        self.att_post_norm = self.ff_post_norm = None
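# ===== hedged sketch (illustration only; the residual wiring is assumed) =====
# The pre/post norm flags match the two standard transformer-block wirings;
# whether this module adds the residual exactly as below is an assumption,
# but the norm placement is the conventional one:
import numpy as np

def demo_layer_norm(x, eps=1e-5):
    mu, var = x.mean(-1, keepdims=True), x.var(-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps)

def demo_ff_sublayer(x, ff, pre_norm, post_norm):
    h = ff(demo_layer_norm(x)) if pre_norm else ff(x)   # pre-norm: norm the input
    out = x + h                                         # residual connection
    return demo_layer_norm(out) if post_norm else out   # post-norm: norm the sum

x = np.random.randn(4, 8)
ff = lambda z: np.maximum(z, 0.)  # stand-in for linear1 -> act -> linear2
_ = demo_ff_sublayer(x, ff, pre_norm=True, post_norm=False)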
def __init__(self, pc, conf: LinkerConf, vocab: HLabelVocab):
    super().__init__(pc, None, None)
    self.conf = conf
    self.vocab = vocab
    assert vocab.nil_as_zero
    assert len(vocab.layered_hlidx) == 1, "Currently we only allow one layer role"
    self.hl_output_size = len(vocab.layered_hlidx[0])  # num of output labels
    # -----
    # models
    sconf = conf
    input_dim = sconf._input_dim
    dim_label = sconf.dim_label
    arc_space = sconf.arc_space
    lab_space = sconf.lab_space
    ff_hid_size = sconf.ff_hid_size
    ff_hid_layer = sconf.ff_hid_layer
    use_biaffine = sconf.use_biaffine
    use_ff = sconf.use_ff
    use_ff2 = sconf.use_ff2
    biaffine_div = sconf.biaffine_div
    biaffine_init_ortho = sconf.biaffine_init_ortho
    transform_act = sconf.transform_act
    #
    self.input_dim = input_dim
    self.num_label = self.hl_output_size
    # label embeddings
    self.emb_ef = self.add_sub_node(
        "eef", Embedding(pc, conf._num_ef_label, dim_label, fix_row0=sconf.zero_unk_lemb))
    self.emb_evt = self.add_sub_node(
        "eevt", Embedding(pc, conf._num_evt_label, dim_label, fix_row0=sconf.zero_unk_lemb))
    # attach/arc
    self.arc_m = self.add_sub_node("am", Affine(pc, [input_dim, dim_label], arc_space, act=transform_act))
    self.arc_h = self.add_sub_node("ah", Affine(pc, [input_dim, dim_label], arc_space, act=transform_act))
    self.arc_scorer = self.add_sub_node(
        "as", BiAffineScorer(pc, arc_space, arc_space, 1, ff_hid_size, ff_hid_layer=ff_hid_layer,
                             use_biaffine=use_biaffine, use_ff=use_ff, use_ff2=use_ff2,
                             biaffine_div=biaffine_div, biaffine_init_ortho=biaffine_init_ortho))
    # labeling
    self.lab_m = self.add_sub_node("lm", Affine(pc, [input_dim, dim_label], lab_space, act=transform_act))
    self.lab_h = self.add_sub_node("lh", Affine(pc, [input_dim, dim_label], lab_space, act=transform_act))
    self.lab_scorer = self.add_sub_node(
        "ls", BiAffineScorer(pc, lab_space, lab_space, self.num_label, ff_hid_size, ff_hid_layer=ff_hid_layer,
                             use_biaffine=use_biaffine, use_ff=use_ff, use_ff2=use_ff2,
                             biaffine_div=biaffine_div, biaffine_init_ortho=biaffine_init_ortho))
    #
    self.nil_mask = None
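# ===== hedged sketch (illustration only; a numpy identity, not the repo's Affine) =====
# Unlike the plain parser scorer, these Affines take a *list* of input dims
# ([input_dim, dim_label]): the token repr is paired with the embedding of
# its predicted type. A multi-input affine over [x, e] equals a single
# affine over their concatenation:
import numpy as np

D, L, out = 8, 4, 6
x, e = np.random.randn(D), np.random.randn(L)
W1, W2, b = np.random.randn(D, out), np.random.randn(L, out), np.zeros(out)
y_multi = x @ W1 + e @ W2 + b
y_concat = np.concatenate([x, e]) @ np.concatenate([W1, W2], 0) + b
assert np.allclose(y_multi, y_concat)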
def __init__(self, pc: BK.ParamCollection, input_dim: int, conf: MaskLMNodeConf, inputter: Inputter):
    super().__init__(pc, conf, name="MLM")
    self.conf = conf
    self.inputter = inputter
    self.input_dim = input_dim
    # this step is performed at the embedder, thus still does not influence the inputter
    self.add_root_token = self.inputter.embedder.add_root_token
    # vocab and padder
    vpack = inputter.vpack
    vocab_word, vocab_pos = vpack.get_voc("word"), vpack.get_voc("pos")
    # no mask fields
    self.nomask_names_set = set(conf.nomask_names)
    # models
    if conf.hid_dim <= 0:  # no hidden layer
        self.hid_layer = None
        self.pred_input_dim = input_dim
    else:
        self.hid_layer = self.add_sub_node("hid", Affine(pc, input_dim, conf.hid_dim, act=conf.hid_act))
        self.pred_input_dim = conf.hid_dim
    # todo(note): unk is the first one above real words
    self.pred_word_size = min(conf.max_pred_rank + 1, vocab_word.unk)
    self.pred_pos_size = vocab_pos.unk
    if conf.tie_input_embeddings:
        zwarn("Tie all preds in mlm with input embeddings!!")
        self.pred_word_layer = self.pred_pos_layer = None
        self.inputter_word_node = self.inputter.embedder.get_node("word")
        self.inputter_pos_node = self.inputter.embedder.get_node("pos")
    else:
        self.inputter_word_node, self.inputter_pos_node = None, None
        self.pred_word_layer = self.add_sub_node(
            "pw", Affine(pc, self.pred_input_dim, self.pred_word_size, init_rop=NoDropRop()))
        self.pred_pos_layer = self.add_sub_node(
            "pp", Affine(pc, self.pred_input_dim, self.pred_pos_size, init_rop=NoDropRop()))
        if conf.init_pred_from_pretrain:
            npvec = vpack.get_emb("word")
            if npvec is None:
                zwarn("Pretrained vector not provided, skip init pred embeddings!!")
            else:
                with BK.no_grad_env():
                    self.pred_word_layer.ws[0].copy_(BK.input_real(npvec[:self.pred_word_size].T))
                zlog(f"Init pred embeddings from pretrained vectors (size={self.pred_word_size}).")
    # =====
    COMBINE_METHOD_FS = {
        "sum": lambda xs: BK.stack(xs, -1).sum(-1),
        "avg": lambda xs: BK.stack(xs, -1).mean(-1),
        "min": lambda xs: BK.stack(xs, -1).min(-1)[0],
        "max": lambda xs: BK.stack(xs, -1).max(-1)[0],
    }
    self.loss_comb_f = COMBINE_METHOD_FS[conf.loss_comb_method]
    self.score_comb_f = COMBINE_METHOD_FS[conf.score_comb_method]
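# ===== hedged sketch (illustration only; a numpy analogue of the torch table) =====
# The combine table reduces per-field losses/scores (e.g. word and pos
# predictions at the same masked position). The `[0]` on min/max above
# mirrors torch, where those reductions return (values, indices):
import numpy as np

COMBINE_DEMO = {
    "sum": lambda xs: np.stack(xs, -1).sum(-1),
    "avg": lambda xs: np.stack(xs, -1).mean(-1),
    "min": lambda xs: np.stack(xs, -1).min(-1),
    "max": lambda xs: np.stack(xs, -1).max(-1),
}
word_loss, pos_loss = np.array([1., 3.]), np.array([2., 1.])
assert np.allclose(COMBINE_DEMO["avg"]([word_loss, pos_loss]), [1.5, 2.0])
assert np.allclose(COMBINE_DEMO["max"]([word_loss, pos_loss]), [2.0, 3.0])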