Example #1
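The constructor of the sentence-level RNN decoder (instantiated as SentRNNDecoder in Example #5, whose keyword arguments match this signature exactly). It builds the decoder LSTM, two linear layers that produce the initial hidden and cell states, an attention module over the encoder outputs, a cross-entropy loss that ignores padded targets (index -1), and a ROUGE evaluator for the extracted sentences.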
    def __init__(self,
                 rnn_type: str = 'lstm',
                 dec_hidden_size: int = 100,
                 dec_input_size: int = 50,
                 dropout: float = 0.1,
                 fixed_dec_step: int = -1,
                 max_dec_steps: int = 2,
                 min_dec_steps: int = 2,
                 schedule_ratio_from_ground_truth: float = 0.5,
                 dec_avd_trigram_rep: bool = True,
                 mult_orac_sample_one: bool = True,
                 abs_board_file="/home/cc/exComp/board.txt",
                 valid_tmp_path='/scratch/cluster/jcxu/exComp',
                 serilization_name: str = ""):
        super().__init__()
        self.device = get_device()
        self._rnn_type = rnn_type
        self._dec_input_size = dec_input_size
        self._dec_hidden_size = dec_hidden_size

        # fixed_dec_step == -1 leaves the number of decoding steps free to
        # vary between min_dec_steps and max_dec_steps; any other value pins
        # both bounds to that step count.
        self.fixed_dec_step = fixed_dec_step
        if fixed_dec_step == -1:
            self.min_dec_steps = min_dec_steps
            self.max_dec_steps = max_dec_steps
        else:
            self.min_dec_steps, self.max_dec_steps = fixed_dec_step, fixed_dec_step
        self.schedule_ratio_from_ground_truth = schedule_ratio_from_ground_truth
        self.mult_orac_sample_one_as_gt = mult_orac_sample_one
        self._dropout = nn.Dropout(dropout)

        self.rnn = self.build_rnn(
            self._rnn_type,
            self._dec_input_size,
            self._dec_hidden_size,
        )
        self.rnn_init_state_h = torch.nn.Linear(dec_hidden_size,
                                                dec_hidden_size)
        self.rnn_init_state_c = torch.nn.Linear(dec_hidden_size,
                                                dec_hidden_size)

        self.attn = NewAttention(enc_dim=dec_input_size,
                                 dec_dim=dec_hidden_size)
        self.CELoss = torch.nn.CrossEntropyLoss(ignore_index=-1,
                                                reduction='none')  # TODO
        self.rouge_metrics_sent = RougeStrEvaluation(
            name='sent',
            path_to_valid=valid_tmp_path,
            writting_address=valid_tmp_path,
            serilization_name=serilization_name)
        self.dec_avd_trigram_rep = dec_avd_trigram_rep
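For orientation, Example #5 below instantiates this constructor with the document encoder's output dimension for both dec_hidden_size and dec_input_size. A minimal instantiation sketch (the class name SentRNNDecoder is taken from Example #5; the sizes here are hypothetical):

decoder = SentRNNDecoder(
    rnn_type='lstm',
    dec_hidden_size=200,  # hypothetical; Example #5 passes enc_doc.get_output_dim()
    dec_input_size=200,
    dropout=0.1,
    fixed_dec_step=-1,    # -1 keeps the step count free between min and max
    max_dec_steps=3,
    min_dec_steps=2,
)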
Example #2
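A span encoder: a bidirectional LSTM (wrapped in AllenNLP's PytorchSeq2SeqWrapper) contextualizes the input, and a GatherCNN pools each span of its output into a fixed-size vector. The commented-out select_gather call suggests the gather argument once chose among pooling strategies.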
    def __init__(self, inp_dim, hid_dim, dropout, nenc_lay=1, gather='sum'):
        super().__init__()
        self.hidden_dim = hid_dim
        self.enc_blstm = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(inp_dim,
                          hid_dim,
                          batch_first=True,
                          bidirectional=True,
                          num_layers=nenc_lay))

        # self._span_encoder = select_gather(gather)
        self._span_encoder = GatherCNN(
            input_dim=self.enc_blstm.get_output_dim(),
            num_filters=5,
            output_dim=self.enc_blstm.get_output_dim())
        self._dropout = torch.nn.Dropout(p=dropout)
        self.device = get_device()
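A usage sketch for the wrapped BiLSTM, rebuilt standalone with hypothetical sizes (AllenNLP's PytorchSeq2SeqWrapper takes the embedded sequence plus a padding mask; the output width is 2 * hid_dim because the LSTM is bidirectional):

import torch
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper

# Standalone equivalent of self.enc_blstm with inp_dim=100, hid_dim=50.
blstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(100, 50, batch_first=True, bidirectional=True))
emb = torch.randn(4, 20, 100)               # (batch=4, seq_len=20, inp_dim=100)
mask = torch.ones(4, 20, dtype=torch.long)  # 1 = real token, 0 = padding
ctx = blstm(emb, mask)                      # -> (4, 20, 2 * 50)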
Example #3
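The constructor of the compression decoder (instantiated as CompressDecoder in Example #5, again with a matching signature). Word representations come either from ELMo or from a plain text-field embedder; an encoder over compression options, attention, and a three-layer feed-forward scorer produce keep/delete logits per option. When aggressive_compression < 0, a sweep of keep-thresholds is evaluated, with one ROUGE metric per threshold.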
    def __init__(self,
                 context_dim,
                 dec_state_dim,
                 enc_hid_dim,
                 text_field_embedder,
                 aggressive_compression: int = -1,
                 keep_threshold: float = 0.5,
                 abs_board_file="/home/cc/exComp/board.txt",
                 gather='mean',
                 dropout=0.5,
                 dropout_emb=0.2,
                 valid_tmp_path='/scratch/cluster/jcxu/exComp',
                 serilization_name: str = "",
                 vocab=None,
                 elmo: bool = False,
                 elmo_weight: str = "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"):
        super().__init__()
        self.use_elmo = elmo
        self.serilization_name = serilization_name
        if elmo:
            from allennlp.modules.elmo import Elmo, batch_to_ids
            from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
            self.vocab = vocab

            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
            weight_file = elmo_weight
            self.elmo = Elmo(options_file, weight_file, 1, dropout=dropout_emb)
            # print(self.elmo.get_output_dim())
            # self.word_emb_dim = text_field_embedder.get_output_dim()
            # self._context_layer = PytorchSeq2SeqWrapper(
            #     torch.nn.LSTM(self.word_emb_dim + self.elmo.get_output_dim(), self.word_emb_dim,
            #                   batch_first=True, bidirectional=True))
            self.word_emb_dim = self.elmo.get_output_dim()
        else:
            self._text_field_embedder = text_field_embedder
            self.word_emb_dim = text_field_embedder.get_output_dim()

        self.XEloss = torch.nn.CrossEntropyLoss(reduction='none')
        self.device = get_device()

        # self.rouge_metrics_compression = RougeStrEvaluation(name='cp', path_to_valid=valid_tmp_path,
        #                                                     writting_address=valid_tmp_path,
        #                                                     serilization_name=serilization_name)
        # self.rouge_metrics_compression_best_possible = RougeStrEvaluation(name='cp_ub', path_to_valid=valid_tmp_path,
        #                                                                   writting_address=valid_tmp_path,
        #                                                                   serilization_name=serilization_name)
        self.enc = EncCompression(inp_dim=self.word_emb_dim, hid_dim=enc_hid_dim, gather=gather)  # TODO dropout

        self.aggressive_compression = aggressive_compression
        self.relu = torch.nn.ReLU()

        self.attn = NewAttention(enc_dim=self.enc.get_output_dim(),
                                 dec_dim=self.enc.get_output_dim_unit() * 2 + dec_state_dim)

        self.concat_size = self.enc.get_output_dim() + self.enc.get_output_dim_unit() * 2 + dec_state_dim
        self.valid_tmp_path = valid_tmp_path
        if self.aggressive_compression < 0:
            self.XELoss = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=-1)
            # self.nn_lin = torch.nn.Linear(self.concat_size, self.concat_size)
            # self.nn_lin2 = torch.nn.Linear(self.concat_size, 2)

            self.ff = FeedForward(input_dim=self.concat_size, num_layers=3,
                                  hidden_dims=[self.concat_size, self.concat_size, 2],
                                  activations=[torch.nn.Tanh(), torch.nn.Tanh(), lambda x: x],
                                  dropout=dropout
                                  )
            # Keep thresholds

            # self.keep_thres = list(np.arange(start=0.2, stop=0.6, step=0.075))
            self.keep_thres = [0.0, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 1.0]
            self.rouge_metrics_compression_dict = OrderedDict()
            for thres in self.keep_thres:
                self.rouge_metrics_compression_dict["{}".format(thres)] = \
                    RougeStrEvaluation(name='cp_{}'.format(thres),
                                       path_to_valid=valid_tmp_path,
                                       writting_address=valid_tmp_path,
                                       serilization_name=serilization_name)
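The per-threshold metric dictionary implies that, at evaluation time, each span's keep-probability is compared against every threshold in keep_thres, yielding one compressed candidate per threshold. A minimal sketch of that sweep (hypothetical probabilities; not code from the source):

import torch

probs = torch.tensor([0.9, 0.1, 0.55, 0.7])  # hypothetical P(keep) per span
for thres in [0.3, 0.5, 0.7]:
    keep_mask = probs >= thres               # spans retained at this threshold
    print(thres, keep_mask.tolist())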
Example #4
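build_model is a thin factory around the Seq2IdxSum constructor shown in Example #5: it forwards the hyperparameters, attaches an L2 regularizer on weights and an L1 regularizer on biases, optionally restores saved weights, and moves the model to the available device.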
def build_model(
        vocab,
        embed_dim: int = 100,
        hid_dim: int = 100,
        min_dec_step: int = 2,
        max_decoding_steps: int = 3,
        fix_edu_num: int = -1,
        use_elmo: bool = False,
        dropout=0.5,
        dropout_emb=0.2,
        span_encoder_type='self_attentive',
        attn_type='dot',
        schedule_ratio_from_ground_truth=0.7,
        pretrain_embedding=None,
        nenc_lay: int = 1,
        mult_orac_sampling: bool = True,
        compression: bool = True,
        word_token_indexers=None,
        alpha: float = 1.0,
        dbg: bool = False,
        dec_avd_trigram_rep: bool = True,
        aggressive_compression: int = -1,
        keep_threshold: float = 0.5,
        weight_alpha=0.0,
        bias_alpha=0.0,
        abs_board_file: str = "/home/cc/exComp/board.txt",
        compress_leadn=-1,
        gather='mean',
        abs_dir_root: str = "/scratch/cluster/jcxu",
        serilization_name="",
        load_save_model: str = None
):
    model = Seq2IdxSum(
        vocab=vocab,
        word_embedding_dim=embed_dim,
        hidden_dim=hid_dim, min_dec_step=min_dec_step,
        max_decoding_steps=max_decoding_steps,
        fix_edu_num=fix_edu_num,
        use_elmo=use_elmo, span_encoder_type=span_encoder_type,
        dropout=dropout, dropout_emb=dropout_emb,
        attn_type=attn_type,
        schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
        pretrain_embedding_file=pretrain_embedding,
        nenc_lay=nenc_lay,
        mult_orac_sampling=mult_orac_sampling,
        word_token_indexers=word_token_indexers,
        compression=compression, alpha=alpha,
        dbg=dbg,
        dec_avd_trigram_rep=dec_avd_trigram_rep,
        aggressive_compression=aggressive_compression,
        keep_threshold=keep_threshold,
        regularizer=RegularizerApplicator([("weight", L2Regularizer(weight_alpha)),
                                           ("bias", L1Regularizer(bias_alpha))]),
        abs_board_file=abs_board_file,
        gather=gather,
        compress_leadn=compress_leadn,
        abs_dir_root=abs_dir_root,
        serilization_name=serilization_name
    )
    if load_save_model:
        # e.g. model.load_state_dict(torch.load("/path/to/model/weights.th"))
        model.load_state_dict(torch.load(load_save_model, map_location=get_device()))

    # model = torch.nn.DataParallel(model)
    device = get_device()
    model = model.to(device)
    return model
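A call sketch (the Vocabulary is assumed to be built elsewhere; the remaining arguments all have defaults in the signature above):

model = build_model(
    vocab=vocab,           # an AllenNLP Vocabulary built from the dataset
    embed_dim=100,
    hid_dim=100,
    compression=True,
    load_save_model=None,  # or a path to weights saved with torch.save
)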
Example #5
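The Seq2IdxSum constructor ties the previous pieces together: a document encoder (EncDoc), the sentence decoder from Example #1 (SentRNNDecoder), and, when compression is enabled, the compression decoder from Example #3 (CompressDecoder). It also builds the context BiLSTM (optionally ELMo-augmented) and the token embedder.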
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        word_embedding_dim: int = 200,
        hidden_dim: int = 200,
        dropout_emb: float = 0.5,
        min_dec_step: int = 2,
        max_decoding_steps=3,
        fix_edu_num=-1,
        dropout: float = 0.5,
        alpha: float = 0.5,
        span_encoder_type='self_attentive',
        use_elmo: bool = True,
        attn_type: str = 'general',
        schedule_ratio_from_ground_truth: float = 0.8,
        pretrain_embedding_file=None,
        nenc_lay: int = 2,
        mult_orac_sampling: bool = False,
        word_token_indexers=None,
        compression: bool = True,
        dbg: bool = False,
        dec_avd_trigram_rep: bool = True,
        aggressive_compression: int = -1,
        compress_leadn: int = -1,
        subsentence: bool = False,
        gather='mean',
        keep_threshold: float = 0.5,
        abs_board_file: str = "/home/cc/exComp/board.txt",
        abs_dir_root: str = "/scratch/cluster/jcxu",
        serilization_name: str = "",
    ) -> None:

        super(Seq2IdxSum, self).__init__(vocab, regularizer)
        self.text_field_embedder = text_field_embedder

        elmo_weight = os.path.join(
            abs_dir_root, "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        # if not os.path.isfile(elmo_weight):
        #     import subprocess
        #     x = "wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5 -P {}".format(abs_dir_root)
        #     subprocess.run(x.split(" "))

        self.device = get_device()
        self.vocab = vocab
        self.dbg = dbg
        self.loss_thres = keep_threshold
        self.compression = compression
        self.comp_leadn = compress_leadn
        # Just encode the whole document without looking at compression options
        self.enc_doc = EncDoc(inp_dim=word_embedding_dim,
                              hid_dim=hidden_dim,
                              vocab=vocab,
                              dropout=dropout,
                              dropout_emb=dropout_emb,
                              pretrain_embedding_file=pretrain_embedding_file,
                              gather=gather)

        self.sent_dec = SentRNNDecoder(
            rnn_type='lstm',
            dec_hidden_size=self.enc_doc.get_output_dim(),
            dec_input_size=self.enc_doc.get_output_dim(),
            dropout=dropout,
            fixed_dec_step=fix_edu_num,
            max_dec_steps=max_decoding_steps,
            min_dec_steps=min_dec_step,
            schedule_ratio_from_ground_truth=schedule_ratio_from_ground_truth,
            dec_avd_trigram_rep=dec_avd_trigram_rep,
            mult_orac_sample_one=mult_orac_sampling,
            abs_board_file=abs_board_file,
            valid_tmp_path=abs_dir_root,
            serilization_name=serilization_name)
        if compression:
            self.compression_dec = CompressDecoder(
                context_dim=hidden_dim * 2,
                dec_state_dim=hidden_dim * 2,
                enc_hid_dim=hidden_dim,
                text_field_embedder=self.enc_doc._text_field_embedder,
                aggressive_compression=aggressive_compression,
                keep_threshold=keep_threshold,
                abs_board_file=abs_board_file,
                gather=gather,
                dropout=dropout,
                dropout_emb=dropout_emb,
                valid_tmp_path=abs_dir_root,
                serilization_name=serilization_name,
                vocab=vocab,
                elmo=use_elmo,
                elmo_weight=elmo_weight)
            self.aggressive_compression = aggressive_compression

        self.use_elmo = use_elmo
        if use_elmo:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
            self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
            # print(self.elmo.get_output_dim())
            self._context_layer = PytorchSeq2SeqWrapper(
                torch.nn.LSTM(word_embedding_dim + self.elmo.get_output_dim(),
                              hidden_dim,
                              batch_first=True,
                              bidirectional=True))
        else:

            self._context_layer = PytorchSeq2SeqWrapper(
                torch.nn.LSTM(word_embedding_dim,
                              hidden_dim,
                              batch_first=True,
                              bidirectional=True))

        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=word_embedding_dim)
        if pretrain_embedding_file is not None:
            logger = logging.getLogger()
            logger.info(
                "Loading word embedding: {}".format(pretrain_embedding_file))
            # Embedding.from_params is a classmethod that returns a new
            # Embedding; its result must be assigned, otherwise the pretrained
            # weights are silently discarded.
            token_embedding = Embedding.from_params(
                vocab=vocab,
                params=Params({
                    "pretrained_file": pretrain_embedding_file,
                    "embedding_dim": word_embedding_dim
                }))
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        # if span_encoder_type == 'self_attentive':
        #     self._span_encoder = SelfAttentiveSpanExtractor(
        #         self._context_layer.get_output_dim()
        #     )
        # else:
        #     raise NotImplementedError

        self._dropout = torch.nn.Dropout(p=dropout)
        self._max_decoding_steps = max_decoding_steps
        self._fix_edu_num = fix_edu_num
        if compression:
            pass
            # self.rouge_metrics_compression = self.compression_dec.rouge_metrics_compression
            # self.rouge_metrics_compression_upper_bound = self.compression_dec.rouge_metrics_compression_best_possible
        self.rouge_metrics_sent = self.sent_dec.rouge_metrics_sent
        self.mult_orac_sampling = mult_orac_sampling
        self.alpha = alpha
        initializer(self)
        if regularizer is not None:
            regularizer(self)
        self.counter = 0  # used for controlling compression and extraction