from datetime import datetime

from embedding import WordEmbedding
from qqp_dataframe import QQPDataFrame


class ParaphraseIdentificator:
    def __init__(self):
        self.model_name = None
        self.qqp_df = None
        self.word_embedding = None
        self.model = None

    def initialize_dataset_frame(self, path, test_rate=0.1):
        self.qqp_df = QQPDataFrame(path=path)
        self.qqp_df.preprocess()
        self.qqp_df.split_train_test(test_rate=test_rate)
        self.qqp_df.fit_tokenizer()

    def initialize_word_embedding(self, path):
        self.word_embedding = WordEmbedding(embfile=path)
        self.word_embedding.create_embedding_matrix(self.qqp_df.tokenizer)

    def train_and_test(self, path, epochs=10, batch_size=64):
        path += self.model_name + '_' + str(datetime.now().date())
        self.train(epochs=epochs, batch_size=batch_size)
        self.save(path)
        del self.model
        self.load(path)
        return self.test(batch_size=batch_size)

    def initialize_model(self):
        pass

    def train(self, epochs=10, batch_size=64):
        pass

    def test(self, batch_size=64):
        pass

    def predict(self, question1, question2):
        pass

    def save(self, path):
        pass

    def load(self, path):
        pass
import torch
import torch.nn.functional as F


class ConvNLP(torch.nn.Module):

    def __init__(self, vocab, no_classes):
        super(ConvNLP, self).__init__()

        self.vocab = vocab
        self.voc_size = len(vocab)
        self.no_classes = no_classes
        self.embedding_dim = 128
        self.conv_dim = 100

        self.embedding = WordEmbedding(self.vocab, self.embedding_dim)

        # Wide convolutions over the token axis: kernel height k with padding k - 1,
        # kernel width spanning the full embedding dimension.
        self.conv1 = torch.nn.Conv2d(1, self.conv_dim, kernel_size=(3, self.embedding_dim), padding=(2, 0))
        self.conv2 = torch.nn.Conv2d(1, self.conv_dim, kernel_size=(4, self.embedding_dim), padding=(3, 0))
        self.conv3 = torch.nn.Conv2d(1, self.conv_dim, kernel_size=(5, self.embedding_dim), padding=(4, 0))

        self.dropout = torch.nn.Dropout(p=0.2)
        self.fc1 = torch.nn.Linear(self.conv_dim * 3, 200)
        self.fc2 = torch.nn.Linear(200, 100)
        self.fc3 = torch.nn.Linear(100, self.no_classes)

    def forward(self, X):
        embedding = self.embedding.embedAndPack(X, batch_first=True)
        embedding = torch.unsqueeze(embedding, 1)  # Channels is second dim, not last

        act1 = F.relu(self.conv1(embedding))
        act2 = F.relu(self.conv2(embedding))
        act3 = F.relu(self.conv3(embedding))

        acts = [act1, act2, act3]

        for i in range(len(acts)):
            acts[i] = torch.squeeze(acts[i], -1)               # [B, conv_dim, L, 1] -> [B, conv_dim, L]
            acts[i] = F.max_pool1d(acts[i], acts[i].size(2))   # max over time: [B, conv_dim, L] -> [B, conv_dim, 1]

        acts = torch.cat(acts, 2)
        acts = acts.view(acts.size(0), -1) # [batch_size, self.conv_dim * 3]

        act1 = self.dropout(self.fc1(acts))
        act2 = self.dropout(self.fc2(act1))
        act3 = self.fc3(act2)
        y_pred = F.softmax(act3, dim=1)
        classes = torch.max(y_pred, 1)[1]

        return y_pred, classes
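
A quick smoke-test sketch for ConvNLP, under the assumption that WordEmbedding.embedAndPack accepts a padded LongTensor of token indices and returns a [batch, seq_len, embedding_dim] float tensor:

# Sketch only; the WordEmbedding / embedAndPack contract is assumed, not confirmed here.
model = ConvNLP(vocab=vocab, no_classes=2)      # `vocab` as expected by WordEmbedding
X = torch.randint(0, len(vocab), (32, 50))      # hypothetical batch: 32 sequences of length 50
y_pred, classes = model(X)
print(y_pred.shape, classes.shape)              # expected: torch.Size([32, 2]), torch.Size([32])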
Example #4
def make_model(args, word_vocab_size, tag_vocab_size, num_labels):
    """Initiliaze a the BiAffine parser according to the specs in args."""
    # Embeddings
    if args.use_chars:
        if args.char_encoder == 'rnn':
            word_embedding = RecurrentCharEmbedding(word_vocab_size,
                                                    args.word_emb_dim,
                                                    padding_idx=PAD_INDEX)
        elif args.char_encoder == 'cnn':
            word_embedding = ConvolutionalCharEmbedding(
                word_vocab_size,
                padding_idx=PAD_INDEX,
                filter_factor=args.filter_factor)
            args.word_emb_dim = word_embedding.output_size  # the CNN encoder fixes its own output size
            print(
                'CNN character model produces word embeddings of dimension {}.'
                .format(args.word_emb_dim))
        elif args.char_encoder == 'transformer':
            raise NotImplementedError(
                'Transformer character encoder not yet implemented.')
    else:
        word_embedding = nn.Embedding(word_vocab_size,
                                      args.word_emb_dim,
                                      padding_idx=PAD_INDEX)
        if args.use_glove:
            raise NotImplementedError('GloVe embeddings not yet implemented.')
    # Words, tags, or both
    if args.disable_tags:
        embedding = WordEmbedding(word_embedding, args.emb_dropout)
        embedding_dim = args.word_emb_dim
    elif args.disable_words:  # Experimental reasons
        tag_embedding = nn.Embedding(tag_vocab_size,
                                     args.tag_emb_dim,
                                     padding_idx=PAD_INDEX)
        embedding = TagEmbedding(tag_embedding, args.emb_dropout)
        embedding_dim = args.tag_emb_dim
    else:
        tag_embedding = nn.Embedding(tag_vocab_size,
                                     args.tag_emb_dim,
                                     padding_idx=PAD_INDEX)
        embedding = WordTagEmbedding(word_embedding, tag_embedding,
                                     args.emb_dropout)
        embedding_dim = args.word_emb_dim + args.tag_emb_dim

    # Encoder
    if args.encoder == 'rnn':
        encoder = RecurrentEncoder(args.rnn_type,
                                   embedding_dim,
                                   args.rnn_hidden,
                                   args.rnn_num_layers,
                                   args.batch_first,
                                   args.rnn_dropout,
                                   bidirectional=True)
        encoder_dim = 2 * args.rnn_hidden
    elif args.encoder == 'cnn':
        encoder = ConvolutionalEncoder(embedding_dim,
                                       args.cnn_num_layers,
                                       args.kernel_size,
                                       dropout=args.cnn_dropout)
        encoder_dim = embedding_dim
    elif args.encoder == 'transformer':
        encoder = TransformerEncoder(embedding_dim,
                                     args.N,
                                     args.d_model,
                                     args.d_ff,
                                     args.h,
                                     dropout=args.trans_dropout)
        encoder_dim = args.d_model
    elif args.encoder == 'none':
        encoder = NoEncoder()
        encoder_dim = embedding_dim

    # Initialize the model.
    model = BiAffineParser(embedding, encoder, args.encoder, encoder_dim,
                           args.mlp_arc_hidden, args.mlp_lab_hidden,
                           args.mlp_dropout, num_labels, nn.CrossEntropyLoss)

    # Initialize parameters with Glorot.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model
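
A usage sketch for make_model: it only reads attributes from args, so a bare argparse.Namespace covering the word+tag embedding and RNN encoder path is enough (the values below are illustrative, not taken from the source):

from argparse import Namespace

args = Namespace(
    use_chars=False, use_glove=False,
    disable_tags=False, disable_words=False,
    word_emb_dim=100, tag_emb_dim=50, emb_dropout=0.33,
    encoder='rnn', rnn_type='LSTM', rnn_hidden=400,
    rnn_num_layers=3, batch_first=True, rnn_dropout=0.33,
    mlp_arc_hidden=500, mlp_lab_hidden=100, mlp_dropout=0.33)

model = make_model(args, word_vocab_size=20000, tag_vocab_size=50, num_labels=40)

Example #5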
import numpy as np

from embedding import WordEmbedding
from qqp_dataframe import QQPDataFrame

qqp_df = QQPDataFrame(path='../train.csv')
qqp_df.split_train_test(test_rate=0.99)
qqp_df.fit_tokenizer()

word_embedding = WordEmbedding(
    embfile='../PassageQueryProject/glove.840B.300d.txt')
word_embedding.create_embedding_matrix(qqp_df.tokenizer)

q1, q2, d = qqp_df.get_train_data()
q1_seq_emb = word_embedding.sequences_to_embeddings(sequences=q1)

while True:
    # q1 = input('Q1:')
    # q2 = input('Q2:')

    questions = input('Q1+Q2:')

    q1 = questions.split('","')[0]
    q2 = questions.split('","')[1]
    print('Q1:', q1)
    print('Q2:', q2)

    q1, q2 = qqp_df.get_prediction_data(q1, q2)

    is_duplicate = np.dot(q1, q2.T)
    print('is_duplicate:', is_duplicate)
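
    # The raw dot product above is scale-dependent and hard to threshold. An alternative
    # sketch (assuming get_prediction_data returns one embedding vector per question):
    # normalise to cosine similarity in [-1, 1]; the 0.8 cut-off is illustrative only.
    cosine = float(np.dot(q1, q2.T)) / (np.linalg.norm(q1) * np.linalg.norm(q2) + 1e-8)
    print('cosine similarity:', cosine, '-> duplicate?', cosine > 0.8)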
Example #6
        content = [row[:-2]] + [body.tokens[row[-2]]] + [row[-1]]
        train_dataset.append(content)

    #Merge stances and bodies in validation dataset
    for row in validation.tokens:
        content = [row[:-2]] + [body.tokens[row[-2]]] + [row[-1]]
        val_dataset.append(content)

    #Merge stances and bodies in test dataset
    for row in test.tokens:
        content = [row[:-1]] + [body.tokens[row[-1]]]
        test_dataset.append(content)

    train_glove = WordEmbedding(train_dataset, 'glove.6B/glove.6B.50d.txt',
                                'train_stances.p')
    val_glove = WordEmbedding(val_dataset, 'glove.6B/glove.6B.50d.txt',
                              'val_stances.p')
    test_glove = WordEmbedding(test_dataset, 'glove.6B/glove.6B.50d.txt',
                               'test_stances.p')

#    #using Word2vec
#    w2v_model = gensim.models.Word2Vec(
#                                       a.tokens,
#                                       size=300, # Dimension of the word embedding
#                                       window=2, # The maximum distance between the current and predicted word within a sentence.
#                                       min_count=1, # Ignores all words with total frequency lower than this.
#                                       sg=1, # If 1, skip-gram is employed; otherwise, CBOW is used.
#                                       negative=10, # Number of negative samples to be drawn
#                                       iter=20, # Number of epochs over the corpus
#                                       )
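
If the commented-out Word2Vec path is revived under a current gensim release (4.x), the size and iter keywords have been renamed to vector_size and epochs. A sketch with the same settings (train_dataset stands in for the original a.tokens and is assumed to be a list of token lists):

import gensim

w2v_model = gensim.models.Word2Vec(
    sentences=train_dataset,  # assumed: a list of token lists
    vector_size=300,          # dimension of the word embedding
    window=2,                 # max distance between current and predicted word
    min_count=1,              # keep words regardless of frequency
    sg=1,                     # 1 = skip-gram, 0 = CBOW
    negative=10,              # number of negative samples
    epochs=20)                # passes over the corpus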
Example #7
        # question1 = tokenizer.texts_to_sequences(qqp_df['question1'])
        # question1 = pad_sequences(question1, maxlen=50)
        #
        # question2 = tokenizer.texts_to_sequences(qqp_df['question2'])
        # question2 = pad_sequences(question2, maxlen=50)
        #
        # is_duplicate = np.asarray(qqp_df['is_duplicate'])
        #
        # print('Duplicated Rate:', is_duplicate.sum(), '/', len(is_duplicate), '=', is_duplicate.sum() / len(is_duplicate))

        qqp_df = QQPDataFrame(path='../train.csv')
        qqp_df.split_train_test(test_rate=0.95)
        qqp_df.fit_tokenizer()
        question1, question2, is_duplicate = qqp_df.get_train_data()

        word_embedding = WordEmbedding(
            embfile='../PassageQueryProject/glove.840B.300d.txt')
        word_embedding.create_embedding_matrix(qqp_df.tokenizer)

        model = create_network(word_embedding=word_embedding, input_length=50)

        # model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
        model.compile(loss=[losses.binary_crossentropy],
                      optimizer='adam',
                      metrics=[metrics.binary_accuracy])
        model.summary()

        model.fit(x=[question1, question2],
                  y=is_duplicate,
                  batch_size=64,
                  epochs=1,
                  validation_split=0.1)
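
        # Sketch of scoring a held-out split after training; get_test_data is an assumed
        # counterpart to get_train_data, not shown in this snippet.
        q1_test, q2_test, y_test = qqp_df.get_test_data()
        loss, accuracy = model.evaluate(x=[q1_test, q2_test], y=y_test, batch_size=64)
        print('test loss:', loss, 'test accuracy:', accuracy)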
Example #8
import os
import pandas as pd
from transformer import Transformer

fpath = os.path.join(DATA_PATH, RAW_DATA)
df = pd.read_csv(fpath)

# Drop duplicate reviews
df = df[~df.duplicated('description')]

descriptions = df['description'].tolist()

FT = FilteredTokenizer()
Tokens = FT.filter_and_tokenize(descriptions, mode=TOKEN_FILTERS, tokenizer=TOKENIZER, filter_fpath=CUSTOM_FILTER_PATH)

WordEmbedding_ = WordEmbedding()
WordEmbedding_.load()

print("====== Examples of things you can do with the embeddings =======")
print(WordEmbedding_.word_vectors.most_similar(positive=['woman', 'king'], negative=['man']))
print(WordEmbedding_.word_vectors.most_similar("dont"))
print(WordEmbedding_.word_vectors.most_similar("a"))

matched_tokens, unmatched_tokens = WordEmbedding_.check_embedding_coverage(list_tokens=Tokens, verbose=True)
# This writes a file named <embedding file name> + <date time> + unmatched tokens,
# listing the distinct unmatched tokens with their counts, sorted in descending order.

# Afterwards the following attributes are available:
print("WordEmbedding_.coverage", WordEmbedding_.coverage)
# print("WordEmbedding_.wordvec_map", WordEmbedding_.wordvec_map)
print("You can get a word vector of the word 'hello' by calling: WordEmbedding_.word_vectors.get_vector('hello')", 