Example No. 1
    def __init__(self,
                 vocab: Vocabulary,
                 sentence_encoder: SentenceEncoder,
                 clause_embedding_dim: int,
                 slot_embedding_dim: int,
                 span_selector: SpanSelector,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None):
        super(ClauseAnsweringModel, self).__init__(vocab, regularizer)
        self._sentence_encoder = sentence_encoder
        self._clause_embedding_dim = clause_embedding_dim
        self._slot_embedding_dim = slot_embedding_dim
        self._span_selector = span_selector
        self._question_embedding_dim = span_selector.get_extra_input_dim()

        self._clause_embedding = Embedding(
            vocab.get_vocab_size("clause-template-labels"),
            clause_embedding_dim)
        self._slot_embedding = Embedding(
            vocab.get_vocab_size("answer-slot-labels"), slot_embedding_dim)

        self._combined_embedding_dim = self._sentence_encoder.get_output_dim() + \
                                       self._clause_embedding_dim + \
                                       self._slot_embedding_dim
        self._question_projection = Linear(self._combined_embedding_dim,
                                           self._question_embedding_dim)

        if self._question_embedding_dim == 0:
            raise ConfigurationError(
                "Question embedding dim (span selector extra input dim) cannot be 0"
            )
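A standalone sketch (plain PyTorch, dimensions assumed rather than taken from the example) of what the projection above computes: the question embedding is a linear map of the concatenated sentence, clause, and slot vectors.

import torch
from torch.nn import Linear

# Assumed dimensions for illustration only.
sent_dim, clause_dim, slot_dim, question_dim = 300, 100, 100, 200
question_projection = Linear(sent_dim + clause_dim + slot_dim, question_dim)

# One "question" is the projection of [sentence encoding; clause embedding; slot embedding].
combined = torch.cat(
    [torch.randn(1, sent_dim), torch.randn(1, clause_dim), torch.randn(1, slot_dim)], dim=-1)
question_embedding = question_projection(combined)  # shape: (1, question_dim)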
Example No. 2
def build_model(vocab: Vocabulary, bert_model: str = None) -> Model:
    if bert_model:
        embedder = BasicTextFieldEmbedder({"bert": PretrainedTransformerEmbedder(model_name=bert_model,
                                                                                 train_parameters=True)})
        encoder = BertPooler(pretrained_model=bert_model, requires_grad=True)
    else:
        # (3) How to get vectors for each Token ID:
        # (3.1) embed each token
        token_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("token_vocab"))
        # pretrained_file='https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'

        # (3.2) embed each character in each token
        character_embedding = Embedding(embedding_dim=3, num_embeddings=vocab.get_vocab_size("character_vocab"))
        cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3,])
        token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)
        # (3.3) embed the POS of each token
        pos_tag_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))

        # Each TokenEmbedder embeds its input, and the results are concatenated in an arbitrary (but consistent) order
        # cf: https://docs.allennlp.org/master/api/modules/text_field_embedders/basic_text_field_embedder/
        embedder = BasicTextFieldEmbedder(
            token_embedders={"tokens": token_embedding,
                             "token_characters": token_encoder,
                             "pos_tags": pos_tag_embedding}
        )  # emb_dim = 10 + 4 + 10 = 24
        encoder = BagOfEmbeddingsEncoder(embedding_dim=24, averaged=True)
        #                                                  ^
        # average the embeddings across time, rather than simply summing
        # (i.e., we will divide the summed embeddings by the length of the sentence).
    return SimpleClassifier(vocab, embedder, encoder)
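A minimal, self-contained illustration (plain PyTorch, per-token dimensions taken from the comments above, shapes assumed) of how the concatenation performed by BasicTextFieldEmbedder yields the 24-dimensional per-token vectors that the bag-of-embeddings encoder expects.

import torch

# Per-token output dims from the example: tokens -> 10, characters via CnnEncoder -> 4, POS tags -> 10.
batch_size, seq_len = 2, 7
token_vecs = torch.randn(batch_size, seq_len, 10)
char_vecs = torch.randn(batch_size, seq_len, 4)
pos_vecs = torch.randn(batch_size, seq_len, 10)

# BasicTextFieldEmbedder concatenates each TokenEmbedder's output along the last dimension.
combined = torch.cat([token_vecs, char_vecs, pos_vecs], dim=-1)
assert combined.shape == (batch_size, seq_len, 24)  # matches BagOfEmbeddingsEncoder(embedding_dim=24)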
Example No. 3
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 context_layer: Seq2SeqEncoder,
                 mention_feedforward: FeedForward,
                 antecedent_feedforward: FeedForward,
                 feature_size: int,
                 max_span_width: int,
                 spans_per_word: float,
                 max_antecedents: int,
                 lexical_dropout: float = 0.2,
                 context_layer_back: Seq2SeqEncoder = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(CoreferenceResolver, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._context_layer = context_layer
        self._context_layer_back = context_layer_back
        self._antecedent_feedforward = TimeDistributed(antecedent_feedforward)
        feedforward_scorer = torch.nn.Sequential(
            TimeDistributed(mention_feedforward),
            TimeDistributed(
                torch.nn.Linear(mention_feedforward.get_output_dim(), 1)))
        self._mention_pruner = SpanPruner(feedforward_scorer)
        self._antecedent_scorer = TimeDistributed(
            torch.nn.Linear(antecedent_feedforward.get_output_dim(), 1))
        # TODO check the output dim when two context layers are passed through
        self._endpoint_span_extractor = EndpointSpanExtractor(
            context_layer.get_output_dim(),
            combination="x,y",
            num_width_embeddings=max_span_width,
            span_width_embedding_dim=feature_size,
            bucket_widths=False)
        self._attentive_span_extractor = SelfAttentiveSpanExtractor(
            input_dim=text_field_embedder.get_output_dim())

        # 10 possible distance buckets.
        self._num_distance_buckets = 10
        self._distance_embedding = Embedding(self._num_distance_buckets,
                                             feature_size)
        self._speaker_embedding = Embedding(2, feature_size)
        self.genres = {
            g: i
            for i, g in enumerate(['bc', 'bn', 'mz', 'nw', 'pt', 'tc', 'wb'])
        }
        self._genre_embedding = Embedding(len(self.genres), feature_size)

        self._max_span_width = max_span_width
        self._spans_per_word = spans_per_word
        self._max_antecedents = max_antecedents

        self._mention_recall = MentionRecall()
        self._conll_coref_scores = ConllCorefScores()
        if lexical_dropout > 0:
            self._lexical_dropout = torch.nn.Dropout(p=lexical_dropout)
        else:
            self._lexical_dropout = lambda x: x
        self._feature_dropout = torch.nn.Dropout(0.2)
        initializer(self)
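Regarding the TODO in the example above, a small arithmetic sketch (values assumed, not from the example) of the endpoint span extractor's output dimension with combination="x,y" and a span-width embedding; a second (backward) context layer would enter through the context output dimension.

# Sketch only; dimensions are assumed.
context_output_dim = 400     # context_layer.get_output_dim()
feature_size = 20            # span_width_embedding_dim

# With combination="x,y" the extractor concatenates the two endpoint vectors,
# then appends the span-width embedding when one is configured.
endpoint_extractor_dim = 2 * context_output_dim + feature_size
print(endpoint_extractor_dim)  # 820 for these assumed values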
Example No. 4
def get_masked_copynet_with_attention(vocab: Vocabulary,
                                      max_decoding_steps: int = 20,
                                      beam_size: int = 1) -> MaskedCopyNet:

    word_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("tokens"),
        embedding_dim=EMB_DIM
    )
    word_embeddings = BasicTextFieldEmbedder({"tokens": word_embeddings})

    masker_embeddings = Embedding(
        num_embeddings=vocab.get_vocab_size("mask_tokens"),
        embedding_dim=MASK_EMB_DIM
    )
    masker_embeddings = BasicTextFieldEmbedder({"tokens": masker_embeddings})

    attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=HID_DIM * 2)
    mask_attention = AdditiveAttention(vector_dim=HID_DIM * 2, matrix_dim=MASK_EMB_DIM)
    lstm = PytorchSeq2SeqWrapper(nn.LSTM(EMB_DIM, HID_DIM, batch_first=True, bidirectional=True))

    return MaskedCopyNet(
        vocab=vocab,
        embedder=word_embeddings,
        encoder=lstm,
        max_decoding_steps=max_decoding_steps,
        attention=attention,
        mask_embedder=masker_embeddings,
        mask_attention=mask_attention,
        beam_size=beam_size
    )
Example No. 5
def build_model(vocab: Vocabulary) -> Model: 
    print("Building the model")
    vocab_size_s = vocab.get_vocab_size("source_tokens")
    vocab_size_t = vocab.get_vocab_size("target_tokens") 
    
    bleu = BLEU(exclude_indices = {0,2,3})

    source_text_embedder = BasicTextFieldEmbedder({"source_tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_s)})
    encoder = PytorchTransformer(input_dim=embedding_dim, num_layers=num_layers,
                                 positional_encoding="sinusoidal", feedforward_hidden_dim=dff,
                                 num_attention_heads=num_head, positional_embedding_size=embedding_dim,
                                 dropout_prob=dropout)

    # target_text_embedder = BasicTextFieldEmbedder({"target_tokens": Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_t)})
    target_text_embedder = Embedding(embedding_dim=embedding_dim, num_embeddings=vocab_size_t)
    decoder_net = StackedSelfAttentionDecoderNet(decoding_dim=embedding_dim, target_embedding_dim=embedding_dim,
                                                 feedforward_hidden_dim=dff, num_layers=num_layers,
                                                 num_attention_heads=num_head, dropout_prob=dropout)
    decoder_net.decodes_parallel = True

    if args.pseudo:
        decoder = PseudoAutoRegressiveSeqDecoder(vocab, decoder_net, max_len, target_text_embedder,
                                                 target_namespace="target_tokens", tensor_based_metric=bleu,
                                                 scheduled_sampling_ratio=0.0, decoder_lin_emb=args.dec)
        return PseudoComposedSeq2Seq(vocab, source_text_embedder, encoder, decoder,
                                     num_virtual_models=num_virtual_models)
    else:
        decoder = AutoRegressiveSeqDecoder(vocab, decoder_net, max_len, target_text_embedder,
                                           target_namespace="target_tokens", tensor_based_metric=bleu,
                                           scheduled_sampling_ratio=0.0)
        return ComposedSeq2Seq(vocab, source_text_embedder, encoder, decoder)
Example No. 6
def load_embedding(args, vocab):
    # Randomly initialize vectors
    if args.embedding_type == "None":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim)

    # Load word2vec vectors
    elif args.embedding_type == "w2v":
        embedding_path = args.embedding_path
        save_weight_file = './{}_embedding_weight.pt'.format(args.dataset)
        if os.path.exists(save_weight_file):
            weight = torch.load(save_weight_file)
        else:
            weight = _read_pretrained_embeddings_file(
                embedding_path,
                embedding_dim=args.embedding_dim,
                vocab=vocab,
                namespace="tokens")
            torch.save(weight, save_weight_file)

        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=args.embedding_dim,
            weight=weight,
            trainable=True)
    else:
        # Guard so token_embedding is always defined for unsupported embedding types.
        raise ValueError("Unsupported embedding_type: {}".format(args.embedding_type))

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    return word_embeddings
Example No. 7
    def _find_model_function(self):
        embedding_dim = self.configuration['embed_size']
        embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
        if os.path.exists(embedding_matrix_filepath):
            embedding_matrix = super()._load_object(embedding_matrix_filepath)
        else:
            embedding_filepath = self.configuration['embedding_filepath']
            embedding_matrix = embedding._read_embeddings_from_text_file(embedding_filepath, embedding_dim,
                                                                         self.vocab, namespace='tokens')
            super()._save_object(embedding_matrix_filepath, embedding_matrix)
        embedding_matrix = embedding_matrix.to(self.configuration['device'])
        token_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
                                    embedding_dim=embedding_dim, padding_index=0, vocab_namespace='tokens',
                                    trainable=self._is_train_token_embeddings(), weight=embedding_matrix)
        # the embedder maps the input tokens to the appropriate embedding matrix
        word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

        position_embedding = Embedding(num_embeddings=self.vocab.get_vocab_size(namespace='position'),
                                    embedding_dim=self._get_position_embeddings_dim(), padding_index=0)
        position_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"position": position_embedding},
                                                                    # we'll be ignoring masks so we'll need to set this to True
                                                                    allow_unmatched_keys=True)

        model_function = self._find_model_function_pure()
        model = model_function(
            word_embedder,
            position_embedder,
            self.distinct_polarities,
            self.vocab,
            self.configuration,
        )
        self._print_args(model)
        model = model.to(self.configuration['device'])
        return model
Example No. 8
def get_embedder(type_, vocab, e_dim, rq_grad=False):
    if type_ == 'elmo':
        opt_file = "data/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        wt_file = "data/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        elmo_embedder = ElmoTokenEmbedder(opt_file,
                                          wt_file,
                                          requires_grad=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})
        return word_embeddings
    elif type_ == 'glove':
        wt_file = "data/glove.6B.300d.txt"
        glove_embedder = Embedding(400000,
                                   300,
                                   pretrained_file=wt_file,
                                   trainable=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": glove_embedder})
        return word_embeddings
    elif type_ == 'bert':
        bert_embedder = PretrainedBertEmbedder(
            pretrained_model="bert-base-uncased",
            top_layer_only=True,
            requires_grad=rq_grad)
        word_embeddings = BasicTextFieldEmbedder({"tokens": bert_embedder},
                                                 allow_unmatched_keys=True)
        return word_embeddings
    else:
        token_embeddings = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=e_dim)
        word_embeddings = BasicTextFieldEmbedder({"tokens": token_embeddings})
        return word_embeddings
Example No. 9
def init_gru(vocab, d_embedding, hidden_rnn_sz, rnn_num_layers,
             rnn_dropout, all_code_types, feedforward_num_layers, feedforward_hidden_dims, feedforward_activations,
             feedforward_dropout, leadlag, add_time, t_max, t_scale, use_timestamps, split_paths):
    """Construct and train GRU"""

    # Initialize feedforward params
    feedforward_hidden_dims = [feedforward_hidden_dims] * feedforward_num_layers
    feedforward_activations = [Activation.by_name(feedforward_activations)()] * feedforward_num_layers
    feedforward_dropout = [feedforward_dropout] * feedforward_num_layers

    # Needed for final layer
    feedforward_num_layers += 1
    feedforward_hidden_dims.append(1)
    feedforward_activations.append(Activation.by_name('linear')())
    feedforward_dropout.append(0)

    # Handle Augmentations
    augmentations = []
    if add_time:
        augmentations.append('add_time')
    if leadlag:
        augmentations.append('leadlag')

    d_embedding_updated = update_dims(augmentations, d_embedding)
    i_augmentations = init_augmentations(augmentations, use_timestamps=use_timestamps, t_max=t_max, t_scale=t_scale)

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size(),
                                embedding_dim=d_embedding)

    # Embedder maps the input tokens to the appropriate embedding matrix
    word_embeddings: TextFieldEmbedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Encoder takes path of (N, L, C) and encodes into state vector
    # encoder = BagOfEmbeddingsEncoder(embedding_dim=d_embedding)
    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.GRU(d_embedding_updated, hidden_rnn_sz, num_layers=rnn_num_layers, batch_first=True, dropout=rnn_dropout))

    classifier_feedforward: FeedForward = FeedForward(
        input_dim=encoder.get_output_dim() * 3 if (all_code_types and split_paths) else encoder.get_output_dim(),
        num_layers=feedforward_num_layers,
        hidden_dims=feedforward_hidden_dims,
        activations=feedforward_activations,
        dropout=feedforward_dropout
    )

    model = BaseModel(
        vocab,
        word_embeddings,
        encoder,
        classifier_feedforward,
        augmentations=i_augmentations
    )
    return model
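A standalone sketch (plain Python, hyperparameter values assumed) of the feedforward spec built at the top of init_gru: per-layer settings are replicated, then a final single-unit linear scoring layer is appended.

# Assumed hyperparameters for illustration only.
num_layers, hidden_dim, activation_name, dropout = 2, 64, "relu", 0.3

hidden_dims = [hidden_dim] * num_layers
activations = [activation_name] * num_layers
dropouts = [dropout] * num_layers

# The final layer produces a single logit with a linear activation and no dropout.
hidden_dims.append(1)
activations.append("linear")
dropouts.append(0)

assert hidden_dims == [64, 64, 1]
assert activations == ["relu", "relu", "linear"]
assert dropouts == [0.3, 0.3, 0]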
Example No. 10
def main():
    # "http://mattmahoney.net/dc/text8.zip" download first
    data_dir = 'data/word2vec/text8/text8'

    # 1. build vocab from file
    vocab = build_vocab(data_dir)

    # 2. build reader
    reader = SimpleSkipGramReader(
        window_size=WIN_SIZE)  # or SkipGramReader(vocab=vocab)
    text8 = reader.read(data_dir)

    embedding_in = Embedding(
        num_embeddings=vocab.get_vocab_size('token_target'),
        embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(
        num_embeddings=vocab.get_vocab_size('token_context'),
        embedding_dim=EMBEDDING_DIM)

    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)

    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)  # important, transform token to index

    model = SkipGramNegativeSamplingModel(vocab,
                                          embedding_in,
                                          embedding_out,
                                          neg_samples=10,
                                          cuda_device=CUDA_DEVICE)
    #
    # model = SkipGramModel(vocab=vocab,
    #                       embedding_in=embedding_in,
    #                       cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=text8,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 spearman correlation: {}'.format(rho))
Example No. 11
    def _find_model_function(self):
        embedding_dim = self.configuration['embed_size']
        embedding_matrix_filepath = self.base_data_dir + 'embedding_matrix'
        if os.path.exists(embedding_matrix_filepath):
            embedding_matrix = super()._load_object(embedding_matrix_filepath)
        else:
            embedding_filepath = self.configuration['embedding_filepath']
            embedding_matrix = embedding._read_embeddings_from_text_file(
                embedding_filepath,
                embedding_dim,
                self.vocab,
                namespace='tokens')
            super()._save_object(embedding_matrix_filepath, embedding_matrix)
        token_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size(namespace='tokens'),
            embedding_dim=embedding_dim,
            padding_index=0,
            vocab_namespace='tokens',
            trainable=False,
            weight=embedding_matrix)
        # the embedder maps the input tokens to the appropriate embedding matrix
        word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"tokens": token_embedding})

        position_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size(namespace='position'),
            embedding_dim=25,
            padding_index=0)
        position_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
            {"position": position_embedding},
            # we'll be ignoring masks so we'll need to set this to True
            allow_unmatched_keys=True)

        # bert_embedder = PretrainedBertEmbedder(
        #     pretrained_model=self.bert_file_path,
        #     top_layer_only=True,  # conserve memory
        #     requires_grad=True
        # )
        # bert_word_embedder: TextFieldEmbedder = BasicTextFieldEmbedder({"bert": bert_embedder},
        #                                                                  # we'll be ignoring masks so we'll need to set this to True
        #                                                                  allow_unmatched_keys=True)
        bert_word_embedder = self._get_bert_word_embedder()

        model = pytorch_models.AsMilSimultaneouslyBert(
            word_embedder,
            position_embedder,
            self.distinct_categories,
            self.distinct_polarities,
            self.vocab,
            self.configuration,
            bert_word_embedder=bert_word_embedder)
        self._print_args(model)
        model = model.to(self.configuration['device'])
        return model
Example No. 12
def main():
    reader = SkipGramReader()
    text8 = reader.read('data/text8/text8')

    vocab = Vocabulary.from_instances(text8,
                                      min_count={
                                          'token_in': 5,
                                          'token_out': 5
                                      })

    reader = SkipGramReader(vocab=vocab)
    text8 = reader.read('data/text8/text8')

    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)
    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=text8,
                      num_epochs=5,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    # write_embeddings(embedding_in, 'data/text8/embeddings.txt', vocab)
    print(get_synonyms('one', embedding_in, vocab))
    print(get_synonyms('december', embedding_in, vocab))
    print(get_synonyms('flower', embedding_in, vocab))
    print(get_synonyms('design', embedding_in, vocab))
    print(get_synonyms('snow', embedding_in, vocab))

    rho = evaluate_embeddings(embedding_in, vocab)
    print('simlex999 spearman correlation: {}'.format(rho))
Example No. 13
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    EMBEDDING_DIM = 6
    HIDDEN_DIM = 6

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    criterion_embedding = Embedding(
        num_embeddings=7, embedding_dim=EMBEDDING_DIM)  # FIXME: num embeddings
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    return MultiCriterionTokenizer(word_embeddings, criterion_embedding, lstm,
                                   vocab)
Example No. 14
def main():
    reader = SkipGramReader()
    dataset = reader.read("data/cv/0/train.txt")
    vocab = Vocabulary.from_files("data/vocabulary")
    params = Params(params={})
    vocab.extend_from_instances(params, dataset)

    reader = SkipGramReader(vocab=vocab)
    dataset = reader.read("data/cv/0/train.txt")
    embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                             embedding_dim=EMBEDDING_DIM)
    embedding_out = Embedding(num_embeddings=vocab.get_vocab_size('token_out'),
                              embedding_dim=EMBEDDING_DIM)
    
    
    if CUDA_DEVICE > -1:
        embedding_in = embedding_in.to(CUDA_DEVICE)
        embedding_out = embedding_out.to(CUDA_DEVICE)
    iterator = BasicIterator(batch_size=BATCH_SIZE)
    iterator.index_with(vocab)

    model = SkipGramModel(vocab=vocab,
                          embedding_in=embedding_in,
                          cuda_device=CUDA_DEVICE)

    # model = SkipGramNegativeSamplingModel(
    #     vocab=vocab,
    #     embedding_in=embedding_in,
    #     embedding_out=embedding_out,
    #     neg_samples=10,
    #     cuda_device=CUDA_DEVICE)

    optimizer = optim.Adam(model.parameters())

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=dataset,
                      num_epochs=20,
                      cuda_device=CUDA_DEVICE)
    trainer.train()

    torch.save(embedding_in.state_dict(), "saved_models/word2vec.th")

    print(get_synonyms('C', embedding_in, vocab))
    print(get_synonyms('G7', embedding_in, vocab))
    print(get_synonyms('G', embedding_in, vocab))
    print(get_synonyms('F', embedding_in, vocab))
    print(get_synonyms('C7', embedding_in, vocab))
Example No. 15
def prepare1():
    """
    First part of preparing data for training
    :return: biLSTM model object, biLSTM vocabulary, data for training, data for validation, cuda biLSTM object,
             biLSTM reader object
    """
    reader = PosDatasetReader()
    train_dataset = reader.read(train_path)
    validation_dataset = reader.read(validation_path)

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 200
    HIDDEN_DIM = 200

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, bidirectional=True))

    model = LstmTagger(word_embeddings, lstm, vocab)
    if torch.cuda.is_available():
        cuda_device = 0
        model = model.cuda(cuda_device)
    else:
        cuda_device = -1

    return model, vocab, train_dataset, validation_dataset, cuda_device, reader
Example No. 16
def build_model(vocab: Vocabulary) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return SimpleClassifier(vocab, embedder, encoder)
Example No. 17
    def __init__(self, vocab: Vocabulary,
                text_field_embedder: TextFieldEmbedder,
                question_generator: QuestionGenerator,
                stacked_encoder: Seq2SeqEncoder = None,
                predicate_feature_dim: int = 100,
                dim_hidden: int = 100,
                embedding_dropout: float = 0.0,
                initializer: InitializerApplicator = InitializerApplicator(),
                regularizer: Optional[RegularizerApplicator] = None):
        super(QuestionPredictor, self).__init__(vocab, regularizer)

        self.dim_hidden = dim_hidden

        self.text_field_embedder = text_field_embedder
        self.predicate_feature_embedding = Embedding(2, predicate_feature_dim)

        self.embedding_dropout = Dropout(p=embedding_dropout)

        self.stacked_encoder = stacked_encoder

        self.span_extractor = EndpointSpanExtractor(self.stacked_encoder.get_output_dim(), combination="x,y")

        self.question_generator = question_generator
        self.slot_labels = question_generator.get_slot_labels()

        self.question_metric = QuestionPredictionMetric(vocab, question_generator.get_slot_labels())
Example No. 18
def build_model(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 300
    # turn the tokens into 300-dim embeddings; then turn the embeddings into encodings
    embedder = BasicTextFieldEmbedder({
        "tokens":
        Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    })
    encoder = CnnEncoder(
        embedding_dim=EMBED_DIMS,
        ngram_filter_sizes=(2, 3, 4, 5),
        num_filters=5
    )  # num_filters is a tad bit dangerous: the reason is that we have this many filters for EACH ngram filter size
    # encoder = BertPooler("bert-base-cased")
    # the output dim is just num_filters * len(ngram_filter_sizes)

    # construct the regularizer applicator
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return DecompensationClassifier(vocab, embedder, encoder,
                                    regularizer_applicator)
Example No. 19
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 stacked_encoder: Seq2SeqEncoder = None,
                 predicate_feature_dim: int = 0,
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None):
        super(SentenceEncoder, self).__init__()
        self._text_field_embedder = text_field_embedder
        self._stacked_encoder = stacked_encoder
        self._predicate_feature_dim = predicate_feature_dim
        self._embedding_dropout = Dropout(p=embedding_dropout)

        if self._predicate_feature_dim > 0:
            self._predicate_feature_embedding = Embedding(
                2, predicate_feature_dim)

        if self._stacked_encoder is not None:
            embedding_dim_with_predicate_feature = (
                self._text_field_embedder.get_output_dim() + self._predicate_feature_dim)
            if embedding_dim_with_predicate_feature != self._stacked_encoder.get_input_dim():
                raise ConfigurationError(
                    "Input dimension of sentence encoder (%s) must be the sum of predicate "
                    "feature dim and text embedding dim (%s)." %
                    (self._stacked_encoder.get_input_dim(), embedding_dim_with_predicate_feature))

        self._metric = BinaryF1()
Example No. 20
    def __init__(self,
                 vocab: Vocabulary,
                 sentence_encoder: SentenceEncoder,
                 qarg_ffnn: FeedForward,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None):
        super(ClauseAndSpanToAnswerSlotModel,
              self).__init__(vocab, regularizer)
        self._sentence_encoder = sentence_encoder
        self._qarg_ffnn = qarg_ffnn

        self._clause_embedding = Embedding(
            vocab.get_vocab_size("abst-clause-labels"),
            self._qarg_ffnn.get_input_dim())
        self._span_extractor = EndpointSpanExtractor(
            input_dim=self._sentence_encoder.get_output_dim(),
            combination="x,y")
        self._span_hidden = TimeDistributed(
            Linear(2 * self._sentence_encoder.get_output_dim(),
                   self._qarg_ffnn.get_input_dim()))
        self._predicate_hidden = Linear(
            self._sentence_encoder.get_output_dim(),
            self._qarg_ffnn.get_input_dim())
        self._qarg_predictor = Linear(self._qarg_ffnn.get_output_dim(),
                                      self.vocab.get_vocab_size("qarg-labels"))
        self._metric = BinaryF1()
Example No. 21
def build_model(
        vocab: Vocabulary,
        embedding_dim: int,
        pretrained_file: str = None,
        initializer: InitializerApplicator = None,
        regularizer: RegularizerApplicator = None
        ) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    word_vec = Embedding(embedding_dim=embedding_dim,
                          num_embeddings=vocab_size,
                          pretrained_file=pretrained_file,
                          vocab=vocab)
    embedding = BasicTextFieldEmbedder({"tokens": word_vec})

    # Use ELMo
    # options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
    # weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'
    # elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    # embedding = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    # Use BERT
    # bert_embedder = PretrainedTransformerEmbedder(
    #     model_name='bert-base-uncased',
    #     max_length=512,
    #     train_parameters=False
    # )
    # embedding = BasicTextFieldEmbedder({"tokens": bert_embedder})

    encoder = BagOfEmbeddingsEncoder(embedding_dim=embedding_dim)
    return SimpleClassifier(vocab, embedding, encoder, initializer, regularizer=regularizer)
Example No. 22
def running_whole_model():
    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    iterator = BucketIterator(batch_size=BATCH_SIZE, sorting_keys=[("sentence", "num_tokens"),
                                                                   ("structures1", "num_tokens"),
                                                                   ("structures2", "num_tokens"),
                                                                   ("structures3", "num_tokens")])
    iterator.index_with(vocab)


    model = All_generating(embed_size=EMBEDDING_DIM,
                           word_embeddings=word_embeddings,
                           vocab=vocab,
                           num_of_candidates=7,
                           )

    # optimizer = adabound.AdaBound(model.parameters(), lr=lr, final_lr=0.1)
    optimizer = optim.Adam(model.parameters(), lr=lr)


    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=whole_train_dataset,
                      validation_dataset=whole_validation_dataset,
                      patience=5,
                      num_epochs=30)
    trainer.train()
Example No. 23
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.0) -> None:
        super(dTPRxNet, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)

        self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
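A minimal PyTorch sketch (dimensions assumed, not from the example) of the constraint noted in the comment above: because the decoder's hidden state is seeded with the encoder's final hidden state, the decoder cell's hidden size must equal the encoder output dimension.

import torch
from torch.nn import LSTMCell

# Assumed dimensions for illustration only.
encoder_output_dim, target_embedding_dim, batch_size = 16, 8, 4
decoder_cell = LSTMCell(target_embedding_dim, encoder_output_dim)

# Seed the decoder hidden state with the encoder's final hidden state.
encoder_final_state = torch.randn(batch_size, encoder_output_dim)
hidden = (encoder_final_state, torch.zeros(batch_size, encoder_output_dim))
hidden = decoder_cell(torch.randn(batch_size, target_embedding_dim), hidden)  # shapes line up only if dims match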
Example No. 24
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 stacked_encoder: Seq2SeqEncoder,
                 binary_feature_dim: int,
                 initializer: InitializerApplicator,
                 embedding_dropout: float = 0.0) -> None:
        super(SemanticRoleLabeler, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="labels",
                                              ignore_classes=["V"])

        self.stacked_encoder = stacked_encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(
            Linear(self.stacked_encoder.get_output_dim(), self.num_classes))
        self.embedding_dropout = Dropout(p=embedding_dropout)
        initializer(self)

        if (text_field_embedder.get_output_dim() + binary_feature_dim
                != stacked_encoder.get_input_dim()):
            raise ConfigurationError(
                "The SRL Model uses a binary verb indicator feature, meaning "
                "the input dimension of the stacked_encoder must be equal to "
                "the output dimension of the text_field_embedder + binary_feature_dim.")
Example No. 25
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 binary_feature_dim: int,
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 label_smoothing: float = None,
                 ignore_span_metric: bool = False) -> None:
        super(SemanticRoleLabeler, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SpanBasedF1Measure(vocab, tag_namespace="labels", ignore_classes=["V"])

        self.encoder = encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_classes))
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric

        check_dimensions_match(text_field_embedder.get_output_dim() + binary_feature_dim,
                               encoder.get_input_dim(),
                               "text embedding dim + verb indicator embedding dim",
                               "encoder input dim")
        initializer(self)
Example No. 26
def running_NER():
    reader = PosDatasetReader()
    train_dataset = reader.read('../data/700_multi_data/600_ner_train.txt')
    validation_dataset = reader.read('../data/700_multi_data/66_ner_test.txt')

    vocab = Vocabulary.from_files("../model_store/vocabulary")

    # '''vocab part'''
    # train_1 = reader.read('../data/train/train.json')
    # train_2 = reader.read('../data/train/dev.json')

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
    model = LstmTagger(word_embeddings, lstm, vocab)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    iterator = BucketIterator(batch_size=2,
                              sorting_keys=[("sentence", "num_tokens")])
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
                      num_epochs=1000)
    trainer.train()
Example No. 27
def build_model(vocab: Vocabulary) -> Model:
    vocab_size = vocab.get_vocab_size(
        "tokens")  # "tokens" from data_reader.token_indexers ??
    embedder = BasicTextFieldEmbedder(
        {"tokens": Embedding(embedding_dim=10, num_embeddings=vocab_size)})
    encoder = BagOfEmbeddingsEncoder(embedding_dim=10)
    return SimpleClassifier(vocab, embedder, encoder)
Example No. 28
def generate_res_file():
    reader = PosDatasetReader()
    vocab = Vocabulary.from_files("../model_store/vocabulary")

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model2 = LstmTagger(word_embeddings, lstm, vocab)

    with open("../model_store/model.th", 'rb') as f:
        model2.load_state_dict(torch.load(f))
    predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)

    train_read_file = open('../data/only_sentence/raw_test.json', 'r')
    train_write_file = open('../data/only_sentence/ner_test.json', 'w')
    for line in train_read_file:
        tag_logits2 = predictor2.predict(
            line.replace('.', '').replace(',', '').replace('\n',
                                                           ''))['tag_logits']
        tag_ids = np.argmax(tag_logits2, axis=-1)
        res = [model2.vocab.get_token_from_index(i, 'labels') for i in tag_ids]
        for i in range(len(res)):
            train_write_file.write(res[i] + ' ')
        # train_write_file.write(str(tag_logits2))
        train_write_file.write('\n')
        train_write_file.flush()
    train_read_file.close()
    train_write_file.close()
    print('finish')


# generate_res_file()
Example No. 29
    def __init__(self, num_classes: int, input_dim: int,
                 output_dim: int) -> None:
        super().__init__()
        self.embedder = Embedding(num_classes, input_dim)
        self.decoder_cell = GRUCell(input_dim, output_dim)
        self.output_projection_layer = Linear(output_dim, num_classes)
        self.recall = UnigramRecall()
Example No. 30
    def __init__(
        self,
        vocab: Vocabulary,
        number_of_branch_map: Dict[str, int],
        child_state_generator: ChildStateGenerator,
        target_namespace: str = "equation_vocab",
        embedding_size: int = 128,
        hidden_size: int = 512,
        beam_size: int = 5,
        initializer: InitializerApplicator = InitializerApplicator(),
    ) -> None:
        super().__init__(vocab, target_namespace)

        # Since nodes can have an arbitrary number of children, we need a map to look that up.
        self.number_of_branch_map = number_of_branch_map

        # GTS modules
        self._source_vocab_size = self.vocab.get_vocab_size("tokens")
        self.encoder = EncoderSeq(input_size=self._source_vocab_size, embedding_size=embedding_size, hidden_size=hidden_size,
                                  n_layers=2)
        self.predict = Prediction(hidden_size=hidden_size, op_nums=self.num_operations,
                                  input_size=self.num_constants)
        self._target_embedder = Embedding(
            num_embeddings=self.num_operations, embedding_dim=embedding_size)
        self.merge = Merge(hidden_size=hidden_size,
                           embedding_size=embedding_size)

        # The generator to generate arbitrary number of child states
        self.generator = child_state_generator

        # At prediction time, we'll use a beam search to find the best target sequence.
        self._beam_size = beam_size

        initializer(self)